diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 437e01c37c6b6..609f23b20cfa6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -364,19 +364,14 @@ unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize, } unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const { - if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS || - AddrSpace == AMDGPUAS::CONSTANT_ADDRESS || - AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT || - AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER || - AddrSpace == AMDGPUAS::BUFFER_RESOURCE || - AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) { - return 512; - } - if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) return 8 * ST->getMaxPrivateElementSize(); - // Common to flat, global, local and region. Assume for unknown addrspace. + // Common to other address spaces: flat, global, buffer, local and region. + // Assume for unknown addrspace. For constant, we also return 128 here despite + // support for wide scalar loads, because very large vectors can cause + // problems in the backend: high register pressure or increased + // fragmentation. return 128; } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll index d4d5cb18bbd30..6930e0d809177 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll @@ -227,14 +227,14 @@ define amdgpu_cs void @single_lane_execution_attribute(i32 inreg %.userdata0, <3 ; GFX10-LABEL: single_lane_execution_attribute: ; GFX10: ; %bb.0: ; %.entry ; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_mov_b32 s12, 0 -; GFX10-NEXT: s_mov_b32 s13, -1 +; GFX10-NEXT: s_mov_b32 s8, 0 +; GFX10-NEXT: s_mov_b32 s9, -1 ; GFX10-NEXT: s_mov_b32 s2, s0 -; GFX10-NEXT: s_and_b64 s[4:5], s[4:5], s[12:13] -; GFX10-NEXT: s_mov_b32 s3, s12 +; GFX10-NEXT: s_and_b64 s[4:5], s[4:5], s[8:9] +; GFX10-NEXT: s_mov_b32 s3, s8 ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v1, -1, 0 ; GFX10-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3] -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX10-NEXT: v_mbcnt_hi_u32_b32 v1, -1, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v1 ; GFX10-NEXT: v_and_b32_e32 v3, 1, v1 @@ -248,8 +248,8 @@ define amdgpu_cs void @single_lane_execution_attribute(i32 inreg %.userdata0, <3 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v2 ; GFX10-NEXT: s_cbranch_vccnz .LBB4_4 ; GFX10-NEXT: ; %bb.1: ; %.preheader.preheader -; GFX10-NEXT: v_mov_b32_e32 v3, s12 -; GFX10-NEXT: v_mov_b32_e32 v4, s12 +; GFX10-NEXT: v_mov_b32_e32 v3, s8 +; GFX10-NEXT: v_mov_b32_e32 v4, s8 ; GFX10-NEXT: .LBB4_2: ; %.preheader ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: buffer_load_dword v5, v3, s[4:7], 0 offen @@ -261,18 +261,20 @@ define amdgpu_cs void @single_lane_execution_attribute(i32 inreg %.userdata0, <3 ; GFX10-NEXT: s_cbranch_vccnz .LBB4_2 ; GFX10-NEXT: ; %bb.3: ; %.preheader._crit_edge ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 -; GFX10-NEXT: s_mov_b32 s13, 0 -; GFX10-NEXT: s_or_b32 s2, s0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 +; GFX10-NEXT: s_mov_b32 s9, 0 +; GFX10-NEXT: s_or_b32 s4, s0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 ; GFX10-NEXT: .LBB4_4: ; %Flow -; 
GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s13 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x10 +; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s9 ; GFX10-NEXT: s_cbranch_vccz .LBB4_6 ; GFX10-NEXT: ; %bb.5: ; %.19 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 ; GFX10-NEXT: v_or_b32_e32 v3, 2, v1 ; GFX10-NEXT: .LBB4_6: ; %.22 ; GFX10-NEXT: v_add_lshl_u32 v0, v0, s1, 2 -; GFX10-NEXT: buffer_store_dword v3, v0, s[8:11], 0 offen +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: buffer_store_dword v3, v0, s[4:7], 0 offen ; GFX10-NEXT: s_endpgm .entry: %.0 = call i64 @llvm.amdgcn.s.getpc() diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll index b666f45521661..fd50ccb147901 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll @@ -845,140 +845,140 @@ define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <4 x i32> %x, <4 x i32> %y) { ; GFX8-LABEL: sdivrem_v4i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 +; GFX8-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_ashr_i32 s2, s12, 31 -; GFX8-NEXT: s_add_i32 s0, s12, s2 -; GFX8-NEXT: s_xor_b32 s3, s0, s2 -; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX8-NEXT: s_sub_i32 s1, 0, s3 -; GFX8-NEXT: s_ashr_i32 s16, s13, 31 -; GFX8-NEXT: s_add_i32 s0, s13, s16 +; GFX8-NEXT: s_ashr_i32 s12, s0, 31 +; GFX8-NEXT: s_add_i32 s0, s0, s12 +; GFX8-NEXT: s_xor_b32 s13, s0, s12 +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s13 +; GFX8-NEXT: s_ashr_i32 s15, s1, 31 +; GFX8-NEXT: s_add_i32 s0, s1, s15 +; GFX8-NEXT: s_sub_i32 s1, 0, s13 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX8-NEXT: s_xor_b32 s13, s0, s16 -; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s13 -; GFX8-NEXT: s_ashr_i32 s12, s8, 31 +; GFX8-NEXT: s_xor_b32 s16, s0, s15 +; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s16 +; GFX8-NEXT: s_ashr_i32 s14, s8, 31 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX8-NEXT: s_add_i32 s0, s8, s12 -; GFX8-NEXT: s_xor_b32 s0, s0, s12 +; GFX8-NEXT: s_add_i32 s0, s8, s14 +; GFX8-NEXT: s_xor_b32 s0, s0, s14 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX8-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX8-NEXT: s_sub_i32 s8, 0, s13 +; GFX8-NEXT: s_sub_i32 s8, 0, s16 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX8-NEXT: v_mul_lo_u32 v2, v0, s3 +; GFX8-NEXT: v_mul_lo_u32 v2, v0, s13 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v0 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v2 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s13, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX8-NEXT: v_subrev_u32_e64 v3, s[0:1], s3, v2 +; GFX8-NEXT: v_subrev_u32_e64 v3, s[0:1], s13, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v0 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s13, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX8-NEXT: v_subrev_u32_e64 v3, s[0:1], s3, v2 +; GFX8-NEXT: v_subrev_u32_e64 v3, s[0:1], s13, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX8-NEXT: v_mul_lo_u32 v3, s8, v1 -; GFX8-NEXT: s_xor_b32 s0, s12, s2 -; 
GFX8-NEXT: s_ashr_i32 s2, s9, 31 -; GFX8-NEXT: s_add_i32 s1, s9, s2 +; GFX8-NEXT: s_ashr_i32 s8, s9, 31 +; GFX8-NEXT: s_add_i32 s1, s9, s8 +; GFX8-NEXT: s_xor_b32 s1, s1, s8 ; GFX8-NEXT: v_mul_hi_u32 v3, v1, v3 -; GFX8-NEXT: s_xor_b32 s1, s1, s2 +; GFX8-NEXT: s_xor_b32 s0, s14, s12 ; GFX8-NEXT: v_xor_b32_e32 v0, s0, v0 -; GFX8-NEXT: v_xor_b32_e32 v2, s12, v2 +; GFX8-NEXT: v_xor_b32_e32 v2, s14, v2 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 ; GFX8-NEXT: v_mul_hi_u32 v1, s1, v1 -; GFX8-NEXT: s_ashr_i32 s3, s14, 31 +; GFX8-NEXT: s_ashr_i32 s9, s2, 31 ; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s0, v0 -; GFX8-NEXT: v_mul_lo_u32 v3, v1, s13 -; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s12, v2 -; GFX8-NEXT: s_add_i32 s0, s14, s3 +; GFX8-NEXT: v_mul_lo_u32 v3, v1, s16 +; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s14, v2 +; GFX8-NEXT: s_add_i32 s0, s2, s9 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s1, v3 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v1 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s13, v2 -; GFX8-NEXT: s_xor_b32 s8, s0, s3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s16, v2 +; GFX8-NEXT: s_xor_b32 s2, s0, s9 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX8-NEXT: v_cvt_f32_u32_e32 v3, s8 -; GFX8-NEXT: v_subrev_u32_e64 v5, s[0:1], s13, v2 +; GFX8-NEXT: v_cvt_f32_u32_e32 v3, s2 +; GFX8-NEXT: v_subrev_u32_e64 v5, s[0:1], s16, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc ; GFX8-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 1, v1 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s13, v2 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s16, v2 ; GFX8-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX8-NEXT: v_subrev_u32_e64 v5, s[0:1], s13, v2 -; GFX8-NEXT: s_sub_i32 s0, 0, s8 +; GFX8-NEXT: v_subrev_u32_e64 v5, s[0:1], s16, v2 +; GFX8-NEXT: s_sub_i32 s0, 0, s2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc ; GFX8-NEXT: v_mul_lo_u32 v5, s0, v3 -; GFX8-NEXT: s_ashr_i32 s9, s10, 31 -; GFX8-NEXT: s_add_i32 s1, s10, s9 -; GFX8-NEXT: s_xor_b32 s1, s1, s9 +; GFX8-NEXT: s_ashr_i32 s12, s10, 31 +; GFX8-NEXT: s_add_i32 s1, s10, s12 +; GFX8-NEXT: s_xor_b32 s1, s1, s12 ; GFX8-NEXT: v_mul_hi_u32 v5, v3, v5 -; GFX8-NEXT: s_xor_b32 s0, s2, s16 -; GFX8-NEXT: v_xor_b32_e32 v2, s2, v2 +; GFX8-NEXT: s_xor_b32 s0, s8, s15 +; GFX8-NEXT: v_xor_b32_e32 v2, s8, v2 ; GFX8-NEXT: v_xor_b32_e32 v1, s0, v1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v5 ; GFX8-NEXT: v_mul_hi_u32 v3, s1, v3 -; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s2, v2 -; GFX8-NEXT: s_ashr_i32 s2, s15, 31 -; GFX8-NEXT: v_mul_lo_u32 v6, v3, s8 +; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s8, v2 +; GFX8-NEXT: s_ashr_i32 s8, s3, 31 +; GFX8-NEXT: v_mul_lo_u32 v6, v3, s2 ; GFX8-NEXT: v_subrev_u32_e32 v1, vcc, s0, v1 -; GFX8-NEXT: s_add_i32 s0, s15, s2 +; GFX8-NEXT: s_add_i32 s0, s3, s8 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s1, v6 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 1, v3 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s8, v2 -; GFX8-NEXT: s_xor_b32 s10, s0, s2 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s2, v2 +; GFX8-NEXT: s_xor_b32 s3, s0, s8 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc -; GFX8-NEXT: v_cvt_f32_u32_e32 v6, s10 -; GFX8-NEXT: v_subrev_u32_e64 v7, s[0:1], s8, v2 +; GFX8-NEXT: v_cvt_f32_u32_e32 v6, s3 +; GFX8-NEXT: v_subrev_u32_e64 v7, s[0:1], s2, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc ; GFX8-NEXT: v_rcp_iflag_f32_e32 v6, v6 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 1, v3 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s8, v2 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s2, v2 ; GFX8-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 ; GFX8-NEXT: 
v_cvt_u32_f32_e32 v6, v6 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc -; GFX8-NEXT: v_subrev_u32_e64 v7, s[0:1], s8, v2 -; GFX8-NEXT: s_sub_i32 s0, 0, s10 +; GFX8-NEXT: v_subrev_u32_e64 v7, s[0:1], s2, v2 +; GFX8-NEXT: s_sub_i32 s0, 0, s3 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v2, v7, vcc ; GFX8-NEXT: v_mul_lo_u32 v2, s0, v6 -; GFX8-NEXT: s_xor_b32 s0, s9, s3 -; GFX8-NEXT: s_ashr_i32 s3, s11, 31 -; GFX8-NEXT: s_add_i32 s1, s11, s3 +; GFX8-NEXT: s_ashr_i32 s2, s11, 31 +; GFX8-NEXT: s_add_i32 s1, s11, s2 +; GFX8-NEXT: s_xor_b32 s1, s1, s2 ; GFX8-NEXT: v_mul_hi_u32 v2, v6, v2 -; GFX8-NEXT: s_xor_b32 s1, s1, s3 +; GFX8-NEXT: s_xor_b32 s0, s12, s9 ; GFX8-NEXT: v_xor_b32_e32 v3, s0, v3 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2 ; GFX8-NEXT: v_mul_hi_u32 v8, s1, v2 ; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s0, v3 -; GFX8-NEXT: v_xor_b32_e32 v3, s9, v7 -; GFX8-NEXT: v_mul_lo_u32 v7, v8, s10 -; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s9, v3 +; GFX8-NEXT: v_xor_b32_e32 v3, s12, v7 +; GFX8-NEXT: v_mul_lo_u32 v7, v8, s3 +; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s12, v3 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s1, v7 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 1, v8 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s10, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc -; GFX8-NEXT: v_subrev_u32_e64 v8, s[0:1], s10, v3 +; GFX8-NEXT: v_subrev_u32_e64 v8, s[0:1], s3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 1, v7 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s10, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc -; GFX8-NEXT: v_subrev_u32_e64 v8, s[0:1], s10, v3 +; GFX8-NEXT: v_subrev_u32_e64 v8, s[0:1], s3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v8, v3, v8, vcc -; GFX8-NEXT: s_xor_b32 s0, s3, s2 +; GFX8-NEXT: s_xor_b32 s0, s2, s8 ; GFX8-NEXT: v_xor_b32_e32 v3, s0, v7 -; GFX8-NEXT: v_xor_b32_e32 v7, s3, v8 +; GFX8-NEXT: v_xor_b32_e32 v7, s2, v8 ; GFX8-NEXT: v_mov_b32_e32 v9, s5 ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s0, v3 ; GFX8-NEXT: v_mov_b32_e32 v8, s4 ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, s3, v7 +; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, s2, v7 ; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7] @@ -986,235 +986,236 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX9-LABEL: sdivrem_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x20 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s6, s12, 31 -; GFX9-NEXT: s_add_i32 s0, s12, s6 -; GFX9-NEXT: s_xor_b32 s7, s0, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_ashr_i32 s4, s13, 31 -; GFX9-NEXT: s_add_i32 s5, s13, s4 +; GFX9-NEXT: s_ashr_i32 s12, s8, 31 +; GFX9-NEXT: s_add_i32 s0, s8, s12 +; GFX9-NEXT: s_xor_b32 s8, s0, s12 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 +; GFX9-NEXT: s_ashr_i32 s13, s9, 31 +; GFX9-NEXT: s_add_i32 s9, s9, s13 +; GFX9-NEXT: s_xor_b32 s9, s9, s13 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_xor_b32 s5, s5, s4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s5 -; GFX9-NEXT: s_sub_i32 s13, 0, s7 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 +; GFX9-NEXT: s_sub_i32 s15, 0, s8 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: s_ashr_i32 s12, 
s8, 31 -; GFX9-NEXT: s_add_i32 s8, s8, s12 -; GFX9-NEXT: v_mul_lo_u32 v2, s13, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_ashr_i32 s14, s4, 31 +; GFX9-NEXT: v_mul_lo_u32 v2, s15, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: s_xor_b32 s8, s8, s12 +; GFX9-NEXT: s_add_i32 s4, s4, s14 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX9-NEXT: s_sub_i32 s13, 0, s5 -; GFX9-NEXT: v_mul_lo_u32 v3, s13, v1 -; GFX9-NEXT: s_ashr_i32 s13, s9, 31 +; GFX9-NEXT: s_xor_b32 s4, s4, s14 +; GFX9-NEXT: s_sub_i32 s15, 0, s9 +; GFX9-NEXT: v_mul_lo_u32 v3, s15, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v0, s8, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 +; GFX9-NEXT: s_ashr_i32 s15, s5, 31 ; GFX9-NEXT: v_mul_hi_u32 v2, v1, v3 -; GFX9-NEXT: s_add_i32 s9, s9, s13 -; GFX9-NEXT: s_xor_b32 s9, s9, s13 -; GFX9-NEXT: v_mul_lo_u32 v3, v0, s7 +; GFX9-NEXT: s_add_i32 s5, s5, s15 +; GFX9-NEXT: v_mul_lo_u32 v3, v0, s8 +; GFX9-NEXT: s_xor_b32 s5, s5, s15 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 ; GFX9-NEXT: v_add_u32_e32 v2, 1, v0 -; GFX9-NEXT: v_mul_hi_u32 v1, s9, v1 -; GFX9-NEXT: v_sub_u32_e32 v3, s8, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 +; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_subrev_u32_e32 v2, s7, v3 +; GFX9-NEXT: v_subrev_u32_e32 v2, s8, v3 +; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v2 -; GFX9-NEXT: s_xor_b32 s6, s12, s6 +; GFX9-NEXT: v_subrev_u32_e32 v3, s8, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX9-NEXT: v_xor_b32_e32 v0, s6, v0 -; GFX9-NEXT: v_mul_lo_u32 v3, v1, s5 -; GFX9-NEXT: v_subrev_u32_e32 v0, s6, v0 -; GFX9-NEXT: s_ashr_i32 s6, s14, 31 -; GFX9-NEXT: s_add_i32 s7, s14, s6 -; GFX9-NEXT: v_xor_b32_e32 v2, s12, v2 -; GFX9-NEXT: s_xor_b32 s7, s7, s6 -; GFX9-NEXT: v_subrev_u32_e32 v4, s12, v2 -; GFX9-NEXT: v_sub_u32_e32 v2, s9, v3 -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s7 +; GFX9-NEXT: v_mul_lo_u32 v3, v1, s9 +; GFX9-NEXT: s_xor_b32 s4, s14, s12 +; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 +; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 +; GFX9-NEXT: v_xor_b32_e32 v2, s14, v2 +; GFX9-NEXT: s_ashr_i32 s4, s10, 31 +; GFX9-NEXT: v_subrev_u32_e32 v4, s14, v2 +; GFX9-NEXT: v_sub_u32_e32 v2, s5, v3 +; GFX9-NEXT: s_add_i32 s5, s10, s4 +; GFX9-NEXT: s_xor_b32 s5, s5, s4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s5 ; GFX9-NEXT: v_add_u32_e32 v5, 1, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GFX9-NEXT: v_subrev_u32_e32 v5, s5, v2 +; GFX9-NEXT: v_subrev_u32_e32 v5, s9, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc ; GFX9-NEXT: v_add_u32_e32 v5, 1, v1 ; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v2 -; GFX9-NEXT: s_sub_i32 s8, 0, s7 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v2 +; GFX9-NEXT: s_sub_i32 s8, 0, s5 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX9-NEXT: v_mul_lo_u32 v5, s8, v3 -; GFX9-NEXT: s_xor_b32 s4, s13, s4 -; GFX9-NEXT: v_xor_b32_e32 v1, s4, v1 -; GFX9-NEXT: v_subrev_u32_e32 v1, s4, v1 -; GFX9-NEXT: s_ashr_i32 s4, s15, 31 -; GFX9-NEXT: s_add_i32 s9, s15, s4 +; 
GFX9-NEXT: s_xor_b32 s8, s15, s13 +; GFX9-NEXT: v_xor_b32_e32 v1, s8, v1 +; GFX9-NEXT: v_subrev_u32_e32 v1, s8, v1 +; GFX9-NEXT: s_ashr_i32 s8, s11, 31 +; GFX9-NEXT: s_add_i32 s10, s11, s8 ; GFX9-NEXT: v_mul_hi_u32 v5, v3, v5 -; GFX9-NEXT: s_xor_b32 s9, s9, s4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s9 -; GFX9-NEXT: v_subrev_u32_e32 v6, s5, v2 -; GFX9-NEXT: s_ashr_i32 s5, s10, 31 -; GFX9-NEXT: s_add_i32 s8, s10, s5 -; GFX9-NEXT: s_xor_b32 s8, s8, s5 +; GFX9-NEXT: s_xor_b32 s10, s10, s8 +; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s10 +; GFX9-NEXT: v_subrev_u32_e32 v6, s9, v2 +; GFX9-NEXT: s_ashr_i32 s9, s6, 31 +; GFX9-NEXT: s_add_i32 s6, s6, s9 +; GFX9-NEXT: s_xor_b32 s6, s6, s9 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 -; GFX9-NEXT: v_mul_hi_u32 v3, s8, v3 +; GFX9-NEXT: v_mul_hi_u32 v3, s6, v3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; GFX9-NEXT: v_xor_b32_e32 v2, s13, v2 -; GFX9-NEXT: v_mul_lo_u32 v6, v3, s7 +; GFX9-NEXT: v_xor_b32_e32 v2, s15, v2 +; GFX9-NEXT: v_mul_lo_u32 v6, v3, s5 ; GFX9-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7 ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v7 -; GFX9-NEXT: v_subrev_u32_e32 v5, s13, v2 -; GFX9-NEXT: v_sub_u32_e32 v2, s8, v6 -; GFX9-NEXT: s_sub_i32 s8, 0, s9 -; GFX9-NEXT: v_mul_lo_u32 v8, s8, v7 +; GFX9-NEXT: v_subrev_u32_e32 v5, s15, v2 +; GFX9-NEXT: v_sub_u32_e32 v2, s6, v6 +; GFX9-NEXT: s_sub_i32 s6, 0, s10 +; GFX9-NEXT: v_mul_lo_u32 v8, s6, v7 ; GFX9-NEXT: v_add_u32_e32 v6, 1, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc -; GFX9-NEXT: v_subrev_u32_e32 v6, s7, v2 +; GFX9-NEXT: v_subrev_u32_e32 v6, s5, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc ; GFX9-NEXT: v_mul_hi_u32 v8, v7, v8 ; GFX9-NEXT: v_add_u32_e32 v6, 1, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc -; GFX9-NEXT: v_subrev_u32_e32 v6, s7, v2 -; GFX9-NEXT: s_ashr_i32 s7, s11, 31 -; GFX9-NEXT: s_add_i32 s8, s11, s7 -; GFX9-NEXT: s_xor_b32 s8, s8, s7 +; GFX9-NEXT: v_subrev_u32_e32 v6, s5, v2 +; GFX9-NEXT: s_ashr_i32 s5, s7, 31 +; GFX9-NEXT: s_add_i32 s6, s7, s5 +; GFX9-NEXT: s_xor_b32 s6, s6, s5 ; GFX9-NEXT: v_add_u32_e32 v7, v7, v8 -; GFX9-NEXT: v_mul_hi_u32 v7, s8, v7 -; GFX9-NEXT: s_xor_b32 s6, s5, s6 +; GFX9-NEXT: v_mul_hi_u32 v7, s6, v7 +; GFX9-NEXT: s_xor_b32 s4, s9, s4 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v6, vcc -; GFX9-NEXT: v_xor_b32_e32 v2, s6, v3 -; GFX9-NEXT: v_mul_lo_u32 v3, v7, s9 +; GFX9-NEXT: v_xor_b32_e32 v2, s4, v3 +; GFX9-NEXT: v_mul_lo_u32 v3, v7, s10 ; GFX9-NEXT: v_add_u32_e32 v8, 1, v7 -; GFX9-NEXT: s_xor_b32 s4, s7, s4 -; GFX9-NEXT: v_subrev_u32_e32 v2, s6, v2 -; GFX9-NEXT: v_sub_u32_e32 v3, s8, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 +; GFX9-NEXT: v_subrev_u32_e32 v2, s4, v2 +; GFX9-NEXT: s_xor_b32 s4, s5, s8 +; GFX9-NEXT: v_sub_u32_e32 v3, s6, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc -; GFX9-NEXT: v_subrev_u32_e32 v8, s9, v3 +; GFX9-NEXT: v_subrev_u32_e32 v8, s10, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc ; GFX9-NEXT: v_add_u32_e32 v8, 1, v7 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc -; GFX9-NEXT: v_subrev_u32_e32 v8, s9, v3 +; GFX9-NEXT: v_subrev_u32_e32 v8, s10, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v3, v8, vcc ; GFX9-NEXT: v_xor_b32_e32 v3, s4, v7 -; GFX9-NEXT: v_xor_b32_e32 v6, s5, v6 +; GFX9-NEXT: 
v_xor_b32_e32 v6, s9, v6 ; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v3 -; GFX9-NEXT: v_xor_b32_e32 v7, s7, v8 +; GFX9-NEXT: v_xor_b32_e32 v7, s5, v8 ; GFX9-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-NEXT: v_subrev_u32_e32 v6, s5, v6 -; GFX9-NEXT: v_subrev_u32_e32 v7, s7, v7 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_subrev_u32_e32 v6, s9, v6 +; GFX9-NEXT: v_subrev_u32_e32 v7, s5, v7 ; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX9-NEXT: global_store_dwordx4 v8, v[4:7], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: sdivrem_v4i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_ashr_i32 s0, s12, 31 -; GFX10-NEXT: s_ashr_i32 s1, s13, 31 -; GFX10-NEXT: s_ashr_i32 s2, s14, 31 -; GFX10-NEXT: s_ashr_i32 s3, s15, 31 -; GFX10-NEXT: s_add_i32 s6, s12, s0 -; GFX10-NEXT: s_add_i32 s7, s13, s1 -; GFX10-NEXT: s_add_i32 s12, s14, s2 -; GFX10-NEXT: s_add_i32 s13, s15, s3 -; GFX10-NEXT: s_xor_b32 s14, s6, s0 -; GFX10-NEXT: s_xor_b32 s15, s7, s1 -; GFX10-NEXT: s_xor_b32 s12, s12, s2 -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s14 -; GFX10-NEXT: s_xor_b32 s13, s13, s3 -; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s15 -; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s12 -; GFX10-NEXT: v_cvt_f32_u32_e32 v3, s13 +; GFX10-NEXT: s_ashr_i32 s4, s0, 31 +; GFX10-NEXT: s_ashr_i32 s5, s1, 31 +; GFX10-NEXT: s_ashr_i32 s6, s2, 31 +; GFX10-NEXT: s_ashr_i32 s7, s3, 31 +; GFX10-NEXT: s_add_i32 s0, s0, s4 +; GFX10-NEXT: s_add_i32 s1, s1, s5 +; GFX10-NEXT: s_add_i32 s2, s2, s6 +; GFX10-NEXT: s_add_i32 s3, s3, s7 +; GFX10-NEXT: s_xor_b32 s16, s0, s4 +; GFX10-NEXT: s_xor_b32 s17, s1, s5 +; GFX10-NEXT: s_xor_b32 s18, s2, s6 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s16 +; GFX10-NEXT: s_xor_b32 s3, s3, s7 +; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s17 +; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s18 +; GFX10-NEXT: v_cvt_f32_u32_e32 v3, s3 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX10-NEXT: s_sub_i32 s6, 0, s14 +; GFX10-NEXT: s_sub_i32 s0, 0, s16 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GFX10-NEXT: s_sub_i32 s7, 0, s15 -; GFX10-NEXT: s_sub_i32 s19, 0, s12 -; GFX10-NEXT: s_ashr_i32 s16, s8, 31 -; GFX10-NEXT: s_ashr_i32 s17, s9, 31 -; GFX10-NEXT: s_ashr_i32 s18, s10, 31 +; GFX10-NEXT: s_sub_i32 s1, 0, s17 +; GFX10-NEXT: s_sub_i32 s2, 0, s18 +; GFX10-NEXT: s_ashr_i32 s19, s12, 31 +; GFX10-NEXT: s_ashr_i32 s20, s13, 31 +; GFX10-NEXT: s_ashr_i32 s21, s14, 31 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GFX10-NEXT: s_xor_b32 s20, s16, s0 +; GFX10-NEXT: s_ashr_i32 s22, s15, 31 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX10-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX10-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX10-NEXT: s_xor_b32 s21, s17, s1 +; GFX10-NEXT: s_xor_b32 s4, s19, s4 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX10-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX10-NEXT: v_mul_lo_u32 v4, s6, v0 -; GFX10-NEXT: s_sub_i32 s6, 0, s13 -; GFX10-NEXT: v_mul_lo_u32 v5, s7, v1 -; GFX10-NEXT: v_mul_lo_u32 v6, s19, v2 -; GFX10-NEXT: v_mul_lo_u32 v7, s6, v3 -; GFX10-NEXT: s_ashr_i32 s19, s11, 31 -; GFX10-NEXT: s_add_i32 s6, s8, s16 -; GFX10-NEXT: s_add_i32 s7, s9, s17 +; GFX10-NEXT: v_mul_lo_u32 v4, s0, v0 +; GFX10-NEXT: s_sub_i32 s0, 0, s3 +; GFX10-NEXT: v_mul_lo_u32 v5, s1, v1 +; GFX10-NEXT: v_mul_lo_u32 v6, 
s2, v2 +; GFX10-NEXT: v_mul_lo_u32 v7, s0, v3 +; GFX10-NEXT: s_add_i32 s0, s12, s19 +; GFX10-NEXT: s_add_i32 s1, s13, s20 +; GFX10-NEXT: s_add_i32 s2, s14, s21 ; GFX10-NEXT: v_mul_hi_u32 v4, v0, v4 -; GFX10-NEXT: s_add_i32 s8, s10, s18 +; GFX10-NEXT: s_add_i32 s12, s15, s22 ; GFX10-NEXT: v_mul_hi_u32 v5, v1, v5 ; GFX10-NEXT: v_mul_hi_u32 v6, v2, v6 ; GFX10-NEXT: v_mul_hi_u32 v7, v3, v7 -; GFX10-NEXT: s_add_i32 s9, s11, s19 -; GFX10-NEXT: s_xor_b32 s10, s6, s16 -; GFX10-NEXT: s_xor_b32 s11, s7, s17 +; GFX10-NEXT: s_xor_b32 s0, s0, s19 +; GFX10-NEXT: s_xor_b32 s1, s1, s20 +; GFX10-NEXT: s_xor_b32 s2, s2, s21 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v4 -; GFX10-NEXT: s_xor_b32 s8, s8, s18 +; GFX10-NEXT: s_xor_b32 s12, s12, s22 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v5 ; GFX10-NEXT: v_add_nc_u32_e32 v2, v2, v6 ; GFX10-NEXT: v_add_nc_u32_e32 v3, v3, v7 -; GFX10-NEXT: s_xor_b32 s9, s9, s19 -; GFX10-NEXT: v_mul_hi_u32 v0, s10, v0 -; GFX10-NEXT: v_mul_hi_u32 v1, s11, v1 -; GFX10-NEXT: v_mul_hi_u32 v2, s8, v2 -; GFX10-NEXT: v_mul_hi_u32 v3, s9, v3 -; GFX10-NEXT: s_xor_b32 s22, s18, s2 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX10-NEXT: v_mul_lo_u32 v4, v0, s14 -; GFX10-NEXT: v_mul_lo_u32 v5, v1, s15 -; GFX10-NEXT: v_mul_lo_u32 v6, v2, s12 -; GFX10-NEXT: v_mul_lo_u32 v7, v3, s13 +; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0 +; GFX10-NEXT: s_xor_b32 s5, s20, s5 +; GFX10-NEXT: v_mul_hi_u32 v1, s1, v1 +; GFX10-NEXT: v_mul_hi_u32 v2, s2, v2 +; GFX10-NEXT: v_mul_hi_u32 v3, s12, v3 +; GFX10-NEXT: s_xor_b32 s6, s21, s6 +; GFX10-NEXT: v_mul_lo_u32 v4, v0, s16 ; GFX10-NEXT: v_add_nc_u32_e32 v8, 1, v0 +; GFX10-NEXT: v_mul_lo_u32 v5, v1, s17 +; GFX10-NEXT: v_mul_lo_u32 v6, v2, s18 +; GFX10-NEXT: v_mul_lo_u32 v7, v3, s3 ; GFX10-NEXT: v_add_nc_u32_e32 v9, 1, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v10, 1, v2 ; GFX10-NEXT: v_add_nc_u32_e32 v11, 1, v3 -; GFX10-NEXT: v_sub_nc_u32_e32 v4, s10, v4 -; GFX10-NEXT: v_sub_nc_u32_e32 v5, s11, v5 -; GFX10-NEXT: v_sub_nc_u32_e32 v6, s8, v6 -; GFX10-NEXT: v_sub_nc_u32_e32 v7, s9, v7 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s14, v4 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s15, v5 -; GFX10-NEXT: v_cmp_le_u32_e64 s1, s12, v6 -; GFX10-NEXT: v_cmp_le_u32_e64 s2, s13, v7 +; GFX10-NEXT: v_sub_nc_u32_e32 v4, s0, v4 +; GFX10-NEXT: v_sub_nc_u32_e32 v5, s1, v5 +; GFX10-NEXT: v_sub_nc_u32_e32 v6, s2, v6 +; GFX10-NEXT: v_sub_nc_u32_e32 v7, s12, v7 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s16, v4 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s17, v5 +; GFX10-NEXT: v_cmp_le_u32_e64 s1, s18, v6 +; GFX10-NEXT: v_cmp_le_u32_e64 s2, s3, v7 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo -; GFX10-NEXT: v_subrev_nc_u32_e32 v8, s14, v4 +; GFX10-NEXT: v_subrev_nc_u32_e32 v8, s16, v4 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v9, s0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v9, s15, v5 +; GFX10-NEXT: v_subrev_nc_u32_e32 v9, s17, v5 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v10, s1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v10, s12, v6 +; GFX10-NEXT: v_subrev_nc_u32_e32 v10, s18, v6 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v11, s2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v11, s13, v7 +; GFX10-NEXT: v_subrev_nc_u32_e32 v11, s3, v7 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v9, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v10, s1 @@ -1223,43 +1224,42 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_add_nc_u32_e32 v9, 1, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v10, 1, v2 ; GFX10-NEXT: v_add_nc_u32_e32 v11, 1, v3 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 
s14, v4 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s15, v5 -; GFX10-NEXT: v_cmp_le_u32_e64 s1, s12, v6 -; GFX10-NEXT: v_cmp_le_u32_e64 s2, s13, v7 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s16, v4 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s17, v5 +; GFX10-NEXT: v_cmp_le_u32_e64 s1, s18, v6 +; GFX10-NEXT: v_cmp_le_u32_e64 s2, s3, v7 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo -; GFX10-NEXT: v_subrev_nc_u32_e32 v8, s14, v4 +; GFX10-NEXT: v_subrev_nc_u32_e32 v8, s16, v4 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v9, s0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v9, s15, v5 +; GFX10-NEXT: v_subrev_nc_u32_e32 v9, s17, v5 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v10, s1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v10, s12, v6 +; GFX10-NEXT: v_subrev_nc_u32_e32 v10, s18, v6 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v11, s2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v11, s13, v7 +; GFX10-NEXT: v_subrev_nc_u32_e32 v11, s3, v7 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v9, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v10, s1 -; GFX10-NEXT: s_xor_b32 s0, s19, s3 +; GFX10-NEXT: s_xor_b32 s0, s22, s7 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v11, s2 -; GFX10-NEXT: v_xor_b32_e32 v0, s20, v0 -; GFX10-NEXT: v_xor_b32_e32 v1, s21, v1 -; GFX10-NEXT: v_xor_b32_e32 v2, s22, v2 +; GFX10-NEXT: v_xor_b32_e32 v0, s4, v0 +; GFX10-NEXT: v_xor_b32_e32 v1, s5, v1 +; GFX10-NEXT: v_xor_b32_e32 v2, s6, v2 ; GFX10-NEXT: v_xor_b32_e32 v3, s0, v3 -; GFX10-NEXT: v_xor_b32_e32 v4, s16, v4 -; GFX10-NEXT: v_xor_b32_e32 v5, s17, v5 -; GFX10-NEXT: v_xor_b32_e32 v6, s18, v6 -; GFX10-NEXT: v_xor_b32_e32 v7, s19, v7 -; GFX10-NEXT: v_subrev_nc_u32_e32 v0, s20, v0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s21, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v2, s22, v2 +; GFX10-NEXT: v_xor_b32_e32 v4, s19, v4 +; GFX10-NEXT: v_xor_b32_e32 v5, s20, v5 +; GFX10-NEXT: v_xor_b32_e32 v6, s21, v6 +; GFX10-NEXT: v_xor_b32_e32 v7, s22, v7 +; GFX10-NEXT: v_subrev_nc_u32_e32 v0, s4, v0 +; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s5, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v2, s6, v2 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s0, v3 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v4, s16, v4 -; GFX10-NEXT: v_subrev_nc_u32_e32 v5, s17, v5 -; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s18, v6 -; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s19, v7 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5] -; GFX10-NEXT: global_store_dwordx4 v8, v[4:7], s[6:7] +; GFX10-NEXT: v_subrev_nc_u32_e32 v4, s19, v4 +; GFX10-NEXT: v_subrev_nc_u32_e32 v5, s20, v5 +; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s21, v6 +; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s22, v7 +; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[8:9] +; GFX10-NEXT: global_store_dwordx4 v8, v[4:7], s[10:11] ; GFX10-NEXT: s_endpgm %div = sdiv <4 x i32> %x, %y store <4 x i32> %div, ptr addrspace(1) %out0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll index a58397eccaba7..f186b1139662e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll @@ -685,8 +685,8 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <4 x i32> %x, <4 x i32> %y) { ; GFX8-LABEL: udivrem_v4i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x20 +; GFX8-NEXT: 
s_load_dwordx8 s[4:11], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s12 ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s13 @@ -783,15 +783,16 @@ define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; ; GFX9-LABEL: udivrem_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x20 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s12 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s13 -; GFX9-NEXT: s_sub_i32 s0, 0, s12 -; GFX9-NEXT: s_sub_i32 s1, 0, s13 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 +; GFX9-NEXT: s_sub_i32 s0, 0, s8 +; GFX9-NEXT: s_sub_i32 s1, 0, s9 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s14 +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s10 +; GFX9-NEXT: s_sub_i32 s12, 0, s10 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -799,88 +800,87 @@ define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; GFX9-NEXT: v_mul_lo_u32 v2, s0, v0 ; GFX9-NEXT: v_mul_lo_u32 v3, s1, v1 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_sub_i32 s4, 0, s14 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 -; GFX9-NEXT: v_mul_hi_u32 v0, s8, v0 -; GFX9-NEXT: v_mul_hi_u32 v1, s9, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 +; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 ; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v4 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX9-NEXT: v_mul_lo_u32 v3, v0, s12 -; GFX9-NEXT: v_mul_lo_u32 v4, v1, s13 +; GFX9-NEXT: v_mul_lo_u32 v3, v0, s8 +; GFX9-NEXT: v_mul_lo_u32 v4, v1, s9 ; GFX9-NEXT: v_add_u32_e32 v5, 1, v0 ; GFX9-NEXT: v_add_u32_e32 v6, 1, v1 -; GFX9-NEXT: v_sub_u32_e32 v3, s8, v3 -; GFX9-NEXT: v_sub_u32_e32 v7, s9, v4 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s12, v3 -; GFX9-NEXT: v_subrev_u32_e32 v4, s12, v3 +; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3 +; GFX9-NEXT: v_sub_u32_e32 v7, s5, v4 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 +; GFX9-NEXT: v_subrev_u32_e32 v4, s8, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s12, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, s12, v3 +; GFX9-NEXT: v_subrev_u32_e32 v4, s8, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, s4, v2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s15 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s13, v7 +; GFX9-NEXT: v_mul_lo_u32 v3, s12, v2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s11 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GFX9-NEXT: v_mul_hi_u32 v3, v2, v3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v5 -; GFX9-NEXT: v_subrev_u32_e32 v6, s13, v7 +; GFX9-NEXT: v_subrev_u32_e32 v6, s9, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc ; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 ; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v5 -; GFX9-NEXT: v_mul_hi_u32 v2, s10, v2 +; GFX9-NEXT: v_mul_hi_u32 v2, s6, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX9-NEXT: v_add_u32_e32 v7, 1, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 
s13, v6 -; GFX9-NEXT: s_sub_i32 s4, 0, s15 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v6 +; GFX9-NEXT: s_sub_i32 s4, 0, s11 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc -; GFX9-NEXT: v_mul_lo_u32 v7, v2, s14 +; GFX9-NEXT: v_mul_lo_u32 v7, v2, s10 ; GFX9-NEXT: v_mul_lo_u32 v8, s4, v3 -; GFX9-NEXT: v_subrev_u32_e32 v5, s13, v6 +; GFX9-NEXT: v_subrev_u32_e32 v5, s9, v6 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc -; GFX9-NEXT: v_sub_u32_e32 v6, s10, v7 +; GFX9-NEXT: v_sub_u32_e32 v6, s6, v7 ; GFX9-NEXT: v_mul_hi_u32 v7, v3, v8 ; GFX9-NEXT: v_add_u32_e32 v8, 1, v2 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s14, v6 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v6 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc ; GFX9-NEXT: v_add_u32_e32 v3, v3, v7 -; GFX9-NEXT: v_mul_hi_u32 v3, s11, v3 -; GFX9-NEXT: v_subrev_u32_e32 v7, s14, v6 +; GFX9-NEXT: v_mul_hi_u32 v3, s7, v3 +; GFX9-NEXT: v_subrev_u32_e32 v7, s10, v6 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc ; GFX9-NEXT: v_add_u32_e32 v7, 1, v2 -; GFX9-NEXT: v_mul_lo_u32 v8, v3, s15 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s14, v6 +; GFX9-NEXT: v_mul_lo_u32 v8, v3, s11 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v6 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc -; GFX9-NEXT: v_subrev_u32_e32 v7, s14, v6 +; GFX9-NEXT: v_subrev_u32_e32 v7, s10, v6 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc -; GFX9-NEXT: v_sub_u32_e32 v7, s11, v8 +; GFX9-NEXT: v_sub_u32_e32 v7, s7, v8 ; GFX9-NEXT: v_add_u32_e32 v8, 1, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s15, v7 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc -; GFX9-NEXT: v_subrev_u32_e32 v8, s15, v7 +; GFX9-NEXT: v_subrev_u32_e32 v8, s11, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc ; GFX9-NEXT: v_add_u32_e32 v8, 1, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s15, v7 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc -; GFX9-NEXT: v_subrev_u32_e32 v8, s15, v7 +; GFX9-NEXT: v_subrev_u32_e32 v8, s11, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc ; GFX9-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX9-NEXT: global_store_dwordx4 v8, v[4:7], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: udivrem_v4i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x20 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s12 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s13 diff --git a/llvm/test/CodeGen/AMDGPU/add.ll b/llvm/test/CodeGen/AMDGPU/add.ll index 422e2747094ce..fc715161b040a 100644 --- a/llvm/test/CodeGen/AMDGPU/add.ll +++ b/llvm/test/CodeGen/AMDGPU/add.ll @@ -313,7 +313,8 @@ define amdgpu_kernel void @s_add_v4i32(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @s_add_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x i32> %b) { ; GFX6-LABEL: s_add_v8i32: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x11 +; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11 +; GFX6-NEXT: s_load_dwordx8 s[12:19], s[0:1], 0x19 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -341,7 +342,8 @@ define amdgpu_kernel void @s_add_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x ; ; GFX8-LABEL: s_add_v8i32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x44 +; 
GFX8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 +; GFX8-NEXT: s_load_dwordx8 s[12:19], s[0:1], 0x64 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_add_i32 s7, s7, s15 @@ -372,7 +374,8 @@ define amdgpu_kernel void @s_add_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x ; ; GFX9-LABEL: s_add_v8i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x44 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 +; GFX9-NEXT: s_load_dwordx8 s[12:19], s[0:1], 0x64 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -399,7 +402,9 @@ define amdgpu_kernel void @s_add_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x ; ; GFX10-LABEL: s_add_v8i32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x44 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 +; GFX10-NEXT: s_load_dwordx8 s[12:19], s[0:1], 0x64 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -425,8 +430,9 @@ define amdgpu_kernel void @s_add_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x ; ; GFX11-LABEL: s_add_v8i32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b512 s[4:19], s[0:1], 0x44 +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x44 +; GFX11-NEXT: s_load_b256 s[12:19], s[0:1], 0x64 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_i32 s2, s7, s15 @@ -451,8 +457,9 @@ define amdgpu_kernel void @s_add_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x ; ; GFX12-LABEL: s_add_v8i32: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b512 s[4:19], s[0:1], 0x44 +; GFX12-NEXT: s_clause 0x2 +; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x44 +; GFX12-NEXT: s_load_b256 s[12:19], s[0:1], 0x64 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_co_i32 s2, s7, s15 @@ -1178,20 +1185,28 @@ entry: define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %a, i64 %b, i64 %c) { ; GFX6-LABEL: add64_in_branch: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd ; GFX6-NEXT: s_mov_b64 s[8:9], 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cmp_ne_u64_e64 s[10:11], s[4:5], 0 -; GFX6-NEXT: s_and_b64 vcc, exec, s[10:11] -; GFX6-NEXT: s_cbranch_vccz .LBB9_4 +; GFX6-NEXT: v_cmp_ne_u64_e64 s[2:3], s[4:5], 0 +; GFX6-NEXT: s_and_b64 vcc, exec, s[2:3] +; GFX6-NEXT: s_cbranch_vccz .LBB9_2 ; GFX6-NEXT: ; %bb.1: ; %else ; GFX6-NEXT: s_add_u32 s4, s4, s6 ; GFX6-NEXT: s_addc_u32 s5, s5, s7 +; GFX6-NEXT: s_branch .LBB9_3 +; GFX6-NEXT: .LBB9_2: +; GFX6-NEXT: s_mov_b64 s[8:9], -1 +; GFX6-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX6-NEXT: .LBB9_3: ; %Flow +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_andn2_b64 vcc, exec, s[8:9] -; GFX6-NEXT: s_cbranch_vccnz .LBB9_3 -; GFX6-NEXT: .LBB9_2: ; %if +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b64 vcc, vcc +; GFX6-NEXT: s_cbranch_vccnz .LBB9_5 +; GFX6-NEXT: ; %bb.4: ; %if ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; GFX6-NEXT: .LBB9_3: ; %endif +; GFX6-NEXT: .LBB9_5: ; %endif ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -1199,25 +1214,29 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, 
ptr addrspace( ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm -; GFX6-NEXT: .LBB9_4: -; GFX6-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GFX6-NEXT: s_branch .LBB9_2 ; ; GFX8-LABEL: add64_in_branch: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX8-NEXT: s_mov_b64 s[8:9], 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX8-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX8-NEXT: s_cbranch_scc0 .LBB9_2 ; GFX8-NEXT: ; %bb.1: ; %else ; GFX8-NEXT: s_add_u32 s4, s4, s6 ; GFX8-NEXT: s_addc_u32 s5, s5, s7 +; GFX8-NEXT: s_branch .LBB9_3 +; GFX8-NEXT: .LBB9_2: +; GFX8-NEXT: s_mov_b64 s[8:9], -1 +; GFX8-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX8-NEXT: .LBB9_3: ; %Flow +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_andn2_b64 vcc, exec, s[8:9] -; GFX8-NEXT: s_cbranch_vccnz .LBB9_3 -; GFX8-NEXT: .LBB9_2: ; %if +; GFX8-NEXT: s_cbranch_vccnz .LBB9_5 +; GFX8-NEXT: ; %bb.4: ; %if +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; GFX8-NEXT: .LBB9_3: ; %endif +; GFX8-NEXT: .LBB9_5: ; %endif ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -1225,71 +1244,87 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm -; GFX8-NEXT: .LBB9_4: -; GFX8-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GFX8-NEXT: s_branch .LBB9_2 ; ; GFX9-LABEL: add64_in_branch: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_mov_b64 s[8:9], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX9-NEXT: s_cbranch_scc0 .LBB9_2 ; GFX9-NEXT: ; %bb.1: ; %else ; GFX9-NEXT: s_add_u32 s4, s4, s6 ; GFX9-NEXT: s_addc_u32 s5, s5, s7 +; GFX9-NEXT: s_branch .LBB9_3 +; GFX9-NEXT: .LBB9_2: +; GFX9-NEXT: s_mov_b64 s[8:9], -1 +; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-NEXT: .LBB9_3: ; %Flow +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_andn2_b64 vcc, exec, s[8:9] -; GFX9-NEXT: s_cbranch_vccnz .LBB9_3 -; GFX9-NEXT: .LBB9_2: ; %if +; GFX9-NEXT: s_cbranch_vccnz .LBB9_5 +; GFX9-NEXT: ; %bb.4: ; %if +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; GFX9-NEXT: .LBB9_3: ; %endif +; GFX9-NEXT: .LBB9_5: ; %endif ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm -; GFX9-NEXT: .LBB9_4: -; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GFX9-NEXT: s_branch .LBB9_2 ; ; GFX10-LABEL: add64_in_branch: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX10-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX10-NEXT: s_cbranch_scc0 .LBB9_2 ; GFX10-NEXT: ; %bb.1: ; %else ; GFX10-NEXT: s_add_u32 s4, s4, s6 ; GFX10-NEXT: s_addc_u32 s5, s5, s7 -; GFX10-NEXT: s_cbranch_execnz .LBB9_3 -; GFX10-NEXT: .LBB9_2: ; %if +; GFX10-NEXT: s_mov_b32 s6, 0 +; GFX10-NEXT: s_branch .LBB9_3 +; GFX10-NEXT: .LBB9_2: +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX10-NEXT: .LBB9_3: ; 
%Flow +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_andn2_b32 vcc_lo, exec_lo, s6 +; GFX10-NEXT: s_cbranch_vccnz .LBB9_5 +; GFX10-NEXT: ; %bb.4: ; %if +; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; GFX10-NEXT: .LBB9_3: ; %endif +; GFX10-NEXT: .LBB9_5: ; %endif ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm -; GFX10-NEXT: .LBB9_4: -; GFX10-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GFX10-NEXT: s_branch .LBB9_2 ; ; GFX11-LABEL: add64_in_branch: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX11-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX11-NEXT: s_cbranch_scc0 .LBB9_2 ; GFX11-NEXT: ; %bb.1: ; %else ; GFX11-NEXT: s_add_u32 s4, s4, s6 ; GFX11-NEXT: s_addc_u32 s5, s5, s7 -; GFX11-NEXT: s_cbranch_execnz .LBB9_3 -; GFX11-NEXT: .LBB9_2: ; %if +; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_branch .LBB9_3 +; GFX11-NEXT: .LBB9_2: +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX11-NEXT: .LBB9_3: ; %Flow +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX11-NEXT: s_cbranch_vccnz .LBB9_5 +; GFX11-NEXT: ; %bb.4: ; %if +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x0 -; GFX11-NEXT: .LBB9_3: ; %endif +; GFX11-NEXT: .LBB9_5: ; %endif ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 @@ -1297,22 +1332,28 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm -; GFX11-NEXT: .LBB9_4: -; GFX11-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GFX11-NEXT: s_branch .LBB9_2 ; ; GFX12-LABEL: add64_in_branch: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX12-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX12-NEXT: s_cbranch_scc0 .LBB9_2 ; GFX12-NEXT: ; %bb.1: ; %else ; GFX12-NEXT: s_add_nc_u64 s[4:5], s[4:5], s[6:7] -; GFX12-NEXT: s_cbranch_execnz .LBB9_3 -; GFX12-NEXT: .LBB9_2: ; %if +; GFX12-NEXT: s_mov_b32 s6, 0 +; GFX12-NEXT: s_branch .LBB9_3 +; GFX12-NEXT: .LBB9_2: +; GFX12-NEXT: s_mov_b32 s6, -1 +; GFX12-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX12-NEXT: .LBB9_3: ; %Flow +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX12-NEXT: s_cbranch_vccnz .LBB9_5 +; GFX12-NEXT: ; %bb.4: ; %if +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x0 -; GFX12-NEXT: .LBB9_3: ; %endif +; GFX12-NEXT: .LBB9_5: ; %endif ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, s4 ; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 @@ -1320,9 +1361,6 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm -; GFX12-NEXT: .LBB9_4: -; GFX12-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GFX12-NEXT: s_branch .LBB9_2 entry: %0 = icmp eq i64 %a, 0 br i1 %0, label %if, label %else diff --git a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll 
b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll index 6f67ce4de9ce5..92d7003bbfee2 100644 --- a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll @@ -105,9 +105,9 @@ define amdgpu_kernel void @s_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace ; VI-NEXT: s_lshr_b32 s3, s0, 16 ; VI-NEXT: s_add_i32 s2, s2, s0 ; VI-NEXT: s_add_i32 s1, s1, s3 -; VI-NEXT: s_and_b32 s0, s2, 0xffff -; VI-NEXT: s_lshl_b32 s1, s1, 16 -; VI-NEXT: s_or_b32 s0, s0, s1 +; VI-NEXT: s_lshl_b32 s0, s1, 16 +; VI-NEXT: s_and_b32 s1, s2, 0xffff +; VI-NEXT: s_or_b32 s0, s1, s0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll b/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll index 6dfc832ff3ac9..9f9563437b748 100644 --- a/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll +++ b/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll @@ -5,49 +5,41 @@ define amdgpu_kernel void @blender_no_live_segment_at_def_error(<4 x float> %ext ; CHECK-LABEL: blender_no_live_segment_at_def_error: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_add_u32 s10, s10, s15 -; CHECK-NEXT: s_addc_u32 s11, s11, 0 ; CHECK-NEXT: s_mov_b32 s32, 0 +; CHECK-NEXT: s_addc_u32 s11, s11, 0 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; CHECK-NEXT: s_load_dwordx8 s[36:43], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx4 s[16:19], s[6:7], 0x10 ; CHECK-NEXT: s_add_u32 s0, s0, s15 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] -; CHECK-NEXT: s_mov_b32 s8, 0 +; CHECK-NEXT: s_mov_b32 s36, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_cmp_lg_u32 s40, 0 -; CHECK-NEXT: s_cbranch_scc1 .LBB0_8 -; CHECK-NEXT: ; %bb.1: ; %if.end13.i.i -; CHECK-NEXT: s_cmp_eq_u32 s42, 0 -; CHECK-NEXT: s_cbranch_scc1 .LBB0_4 -; CHECK-NEXT: ; %bb.2: ; %if.else251.i.i -; CHECK-NEXT: s_cmp_lg_u32 s43, 0 -; CHECK-NEXT: s_mov_b32 s15, 0 +; CHECK-NEXT: s_cmp_lg_u32 s16, 0 +; CHECK-NEXT: s_cbranch_scc0 .LBB0_2 +; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: s_load_dwordx4 s[36:39], s[6:7], 0x0 +; CHECK-NEXT: s_branch .LBB0_9 +; CHECK-NEXT: .LBB0_2: ; %if.end13.i.i +; CHECK-NEXT: s_cmp_eq_u32 s18, 0 +; CHECK-NEXT: s_cbranch_scc1 .LBB0_7 +; CHECK-NEXT: ; %bb.3: ; %if.else251.i.i +; CHECK-NEXT: s_cmp_lg_u32 s19, 0 +; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9] ; CHECK-NEXT: s_cselect_b32 s8, -1, 0 +; CHECK-NEXT: s_mov_b32 s15, 0 ; CHECK-NEXT: s_and_b32 vcc_lo, exec_lo, s8 -; CHECK-NEXT: s_cbranch_vccz .LBB0_5 -; CHECK-NEXT: ; %bb.3: -; CHECK-NEXT: s_mov_b32 s36, 0 -; CHECK-NEXT: s_andn2_b32 vcc_lo, exec_lo, s8 -; CHECK-NEXT: s_cbranch_vccz .LBB0_6 -; CHECK-NEXT: s_branch .LBB0_7 -; CHECK-NEXT: .LBB0_4: -; CHECK-NEXT: s_mov_b32 s10, s8 -; CHECK-NEXT: s_mov_b32 s11, s8 -; CHECK-NEXT: s_mov_b32 s9, s8 -; CHECK-NEXT: s_mov_b64 s[38:39], s[10:11] -; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9] -; CHECK-NEXT: s_branch .LBB0_7 -; CHECK-NEXT: .LBB0_5: ; %if.then263.i.i -; CHECK-NEXT: v_cmp_lt_f32_e64 s8, s41, 0 +; CHECK-NEXT: s_cbranch_vccnz .LBB0_5 +; CHECK-NEXT: ; %bb.4: ; %if.then263.i.i +; CHECK-NEXT: v_cmp_lt_f32_e64 s8, s17, 0 ; CHECK-NEXT: s_mov_b32 s36, 1.0 ; CHECK-NEXT: s_mov_b32 s15, 0x7fc00000 ; CHECK-NEXT: s_mov_b32 s37, s36 ; CHECK-NEXT: s_mov_b32 s38, s36 ; CHECK-NEXT: s_mov_b32 s39, s36 +; CHECK-NEXT: .LBB0_5: ; %Flow ; CHECK-NEXT: s_andn2_b32 vcc_lo, exec_lo, s8 -; CHECK-NEXT: s_cbranch_vccnz .LBB0_7 -; 
CHECK-NEXT: .LBB0_6: ; %if.end273.i.i +; CHECK-NEXT: s_cbranch_vccnz .LBB0_8 +; CHECK-NEXT: ; %bb.6: ; %if.end273.i.i ; CHECK-NEXT: s_add_u32 s8, s6, 40 ; CHECK-NEXT: s_addc_u32 s9, s7, 0 ; CHECK-NEXT: s_getpc_b64 s[16:17] @@ -66,23 +58,24 @@ define amdgpu_kernel void @blender_no_live_segment_at_def_error(<4 x float> %ext ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: s_mov_b64 s[6:7], s[34:35] +; CHECK-NEXT: .LBB0_7: ; %if.end294.i.i ; CHECK-NEXT: s_mov_b32 s37, s36 ; CHECK-NEXT: s_mov_b32 s38, s36 ; CHECK-NEXT: s_mov_b32 s39, s36 -; CHECK-NEXT: .LBB0_7: ; %if.end294.i.i +; CHECK-NEXT: .LBB0_8: ; %if.end294.i.i ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:12 ; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8 ; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 ; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; CHECK-NEXT: .LBB0_8: ; %kernel_direct_lighting.exit +; CHECK-NEXT: .LBB0_9: ; %kernel_direct_lighting.exit ; CHECK-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x20 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v0, s36 ; CHECK-NEXT: v_mov_b32_e32 v4, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, s37 ; CHECK-NEXT: v_mov_b32_e32 v2, s38 ; CHECK-NEXT: v_mov_b32_e32 v3, s39 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] ; CHECK-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll index 384715a849c1e..f93c8508e42f8 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll @@ -16,16 +16,13 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr31 = COPY $vgpr0, implicit $exec ; GFX90A-NEXT: renamable $sgpr33 = S_LOAD_DWORD_IMM renamable $sgpr6_sgpr7, 24, 0 :: (dereferenceable invariant load (s32) from %ir.arg4.kernarg.offset.align.down, align 8, addrspace 4) ; GFX90A-NEXT: renamable $sgpr20_sgpr21_sgpr22_sgpr23 = S_LOAD_DWORDX4_IMM renamable $sgpr6_sgpr7, 24, 0 :: (dereferenceable invariant load (s128) from %ir.arg6.kernarg.offset.align.down, align 8, addrspace 4) - ; GFX90A-NEXT: renamable $sgpr15 = S_LOAD_DWORD_IMM renamable $sgpr6_sgpr7, 40, 0 :: (dereferenceable invariant load (s32) from %ir.arg6.kernarg.offset.align.down + 16, align 8, addrspace 4) - ; GFX90A-NEXT: renamable $sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX4_IMM renamable $sgpr6_sgpr7, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4) - ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_LOAD_DWORDX2_IMM renamable $sgpr6_sgpr7, 16, 0 :: (dereferenceable invariant load (s64) from %ir.arg.kernarg.offset1 + 16, align 16, addrspace 4) ; GFX90A-NEXT: S_BITCMP1_B32 renamable $sgpr33, 0, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr8_sgpr9 = S_CSELECT_B64 -1, 0, implicit killed $scc ; GFX90A-NEXT: renamable $sgpr30_sgpr31 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $sgpr24_sgpr25 = S_XOR_B64 renamable $sgpr8_sgpr9, -1, implicit-def dead $scc ; GFX90A-NEXT: S_BITCMP1_B32 renamable $sgpr33, 8, implicit-def $scc - ; GFX90A-NEXT: renamable $sgpr26_sgpr27 = S_CSELECT_B64 -1, 0, implicit killed $scc - ; GFX90A-NEXT: renamable $sgpr26_sgpr27 = S_XOR_B64 killed renamable $sgpr26_sgpr27, -1, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr16_sgpr17 = S_CSELECT_B64 -1, 0, implicit killed $scc + 
; GFX90A-NEXT: renamable $sgpr26_sgpr27 = S_XOR_B64 killed renamable $sgpr16_sgpr17, -1, implicit-def dead $scc ; GFX90A-NEXT: renamable $vgpr3 = V_MOV_B32_e32 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr2 = DS_READ_B32_gfx9 renamable $vgpr3, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(3) null`, align 8, addrspace 3) ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_MOV_B64 0 @@ -34,7 +31,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.1.bb103: ; GFX90A-NEXT: successors: %bb.59(0x40000000), %bb.2(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr42_sgpr43, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000FF, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr42_sgpr43, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr30_sgpr31 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, renamable $sgpr26_sgpr27, implicit-def dead $scc @@ -47,7 +44,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.2: ; GFX90A-NEXT: successors: %bb.3(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr22, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6, $sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr30_sgpr31, $sgpr42_sgpr43, $sgpr54, $sgpr55, $sgpr16_sgpr17_sgpr18, $sgpr18_sgpr19, $sgpr20_sgpr21_sgpr22, $vgpr2, $vgpr3, $vgpr10, $vgpr24, $vgpr18, $vgpr20 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr22, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6, $sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr30_sgpr31, $sgpr42_sgpr43, $sgpr20_sgpr21_sgpr22, $vgpr2, $vgpr3, $vgpr10, $vgpr24, $vgpr18, $vgpr20 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr15 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $sgpr23 = IMPLICIT_DEF @@ -58,17 +55,18 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr25 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $sgpr28_sgpr29 = S_MOV_B64 0 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.3.Flow17: + ; GFX90A-NEXT: bb.3.Flow15: ; GFX90A-NEXT: successors: %bb.4(0x40000000), %bb.58(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr23, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr42_sgpr43, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000FF, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr23, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, 
$sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr42_sgpr43, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: renamable $sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX4_IMM renamable $sgpr6_sgpr7, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4) ; GFX90A-NEXT: renamable $vgpr30 = V_AND_B32_e32 1023, $vgpr31, implicit $exec ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr30_sgpr31, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_VCCZ %bb.58, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.4.bb15: ; GFX90A-NEXT: successors: %bb.35(0x40000000), %bb.5(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000FF, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr42_sgpr43 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000FF, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr42_sgpr43 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = V_LSHLREV_B64_e64 2, $vgpr2_vgpr3, implicit $exec ; GFX90A-NEXT: renamable $vgpr4 = COPY renamable $sgpr17, implicit $exec @@ -85,7 +83,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr42_sgpr43 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr34_sgpr35 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr16_sgpr17 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 0 @@ -115,9 +113,9 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $sgpr15 = IMPLICIT_DEF ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.6.Flow20: + ; GFX90A-NEXT: bb.6.Flow18: ; GFX90A-NEXT: successors: %bb.7(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, 
$sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr19 = COPY renamable $sgpr15, implicit $exec ; GFX90A-NEXT: renamable $vgpr18 = COPY $sgpr15, implicit $exec @@ -128,15 +126,15 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr25 = COPY $sgpr15, implicit $exec ; GFX90A-NEXT: renamable $vgpr24 = COPY $sgpr15, implicit $exec ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.7.Flow19: + ; GFX90A-NEXT: bb.7.Flow17: ; GFX90A-NEXT: successors: %bb.63(0x40000000), %bb.8(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, 
$vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_MOV_B64 0 ; GFX90A-NEXT: $sgpr30_sgpr31 = S_AND_SAVEEXEC_B64 $sgpr28_sgpr29, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.63, implicit $exec ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.8.Flow32: + ; GFX90A-NEXT: bb.8.Flow30: ; GFX90A-NEXT: successors: %bb.9(0x40000000), %bb.10(0x40000000) ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} @@ -151,28 +149,28 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr9, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.10.Flow33: + ; GFX90A-NEXT: bb.10.Flow31: ; GFX90A-NEXT: successors: %bb.11(0x40000000), %bb.12(0x40000000) ; 
GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr8_sgpr9, implicit-def $scc - ; GFX90A-NEXT: $sgpr8_sgpr9 = S_AND_SAVEEXEC_B64 $sgpr56_sgpr57, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: $sgpr8_sgpr9 = S_AND_SAVEEXEC_B64 $sgpr54_sgpr55, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: renamable $sgpr8_sgpr9 = S_XOR_B64 $exec, killed renamable $sgpr8_sgpr9, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_EXECZ %bb.12, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.11.bb84: ; GFX90A-NEXT: successors: %bb.12(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr7, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.12.Flow34: + ; GFX90A-NEXT: bb.12.Flow32: ; GFX90A-NEXT: successors: %bb.13(0x40000000), 
%bb.14(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr8_sgpr9, implicit-def $scc ; GFX90A-NEXT: $sgpr8_sgpr9 = S_AND_SAVEEXEC_B64 $sgpr52_sgpr53, implicit-def $exec, implicit-def $scc, implicit $exec @@ -181,15 +179,15 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.13.bb79: ; GFX90A-NEXT: successors: %bb.14(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) 
- ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.14.Flow35: + ; GFX90A-NEXT: bb.14.Flow33: ; GFX90A-NEXT: successors: %bb.15(0x40000000), %bb.16(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $vgpr0_vgpr1:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr8_sgpr9, implicit-def $scc ; GFX90A-NEXT: $sgpr8_sgpr9 = S_AND_SAVEEXEC_B64 $sgpr16_sgpr17, implicit-def $exec, implicit-def $scc, implicit $exec @@ -198,18 +196,18 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.15.bb72: ; GFX90A-NEXT: successors: %bb.16(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $vgpr0_vgpr1:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr8 = S_ADD_U32 renamable $sgpr6, 48, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr9 = S_ADDC_U32 killed renamable $sgpr7, 0, implicit-def dead $scc, implicit killed $scc ; 
GFX90A-NEXT: renamable $sgpr6_sgpr7 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @f2, target-flags(amdgpu-gotprel32-hi) @f2, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr6_sgpr7 = S_LOAD_DWORDX2_IMM killed renamable $sgpr6_sgpr7, 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) ; GFX90A-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr6_sgpr7, @f2, csr_amdgpu_gfx90ainsts, implicit $sgpr4_sgpr5, implicit undef $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit undef $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit $vgpr1 - ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.16.Flow36: + ; GFX90A-NEXT: bb.16.Flow34: ; GFX90A-NEXT: successors: %bb.17(0x40000000), %bb.18(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr42_sgpr43, implicit-def $scc ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr50_sgpr51, implicit-def $exec, implicit-def $scc, implicit $exec @@ -218,15 +216,15 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.17.bb67: ; GFX90A-NEXT: successors: %bb.18(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr47, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, 
implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr46, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.18.Flow37: + ; GFX90A-NEXT: bb.18.Flow35: ; GFX90A-NEXT: successors: %bb.19(0x40000000), %bb.20(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr48_sgpr49, implicit-def $exec, implicit-def $scc, implicit $exec @@ -235,15 +233,15 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.19.bb62: ; GFX90A-NEXT: successors: %bb.20(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr63, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr62, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.20.Flow38: + ; GFX90A-NEXT: bb.20.Flow36: ; GFX90A-NEXT: successors: 
%bb.21(0x40000000), %bb.22(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr46_sgpr47, implicit-def $exec, implicit-def $scc, implicit $exec @@ -252,15 +250,15 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.21.bb54: ; GFX90A-NEXT: successors: %bb.22(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr61, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr60, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.22.Flow39: + ; GFX90A-NEXT: bb.22.Flow37: ; GFX90A-NEXT: successors: %bb.23(0x40000000), %bb.24(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc ; 
GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr44_sgpr45, implicit-def $exec, implicit-def $scc, implicit $exec @@ -269,15 +267,15 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.23.bb47: ; GFX90A-NEXT: successors: %bb.24(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr59, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr58, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.24.Flow40: + ; GFX90A-NEXT: bb.24.Flow38: ; GFX90A-NEXT: successors: %bb.25(0x40000000), %bb.26(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr40_sgpr41, implicit-def $exec, implicit-def $scc, implicit $exec @@ -286,15 +284,15 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.25.bb40: ; GFX90A-NEXT: successors: %bb.26(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr57, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr56, 
$sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.26.Flow41: + ; GFX90A-NEXT: bb.26.Flow39: ; GFX90A-NEXT: successors: %bb.27(0x40000000), %bb.28(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr38_sgpr39, implicit-def $exec, implicit-def $scc, implicit $exec @@ -303,24 +301,24 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.27.bb33: ; GFX90A-NEXT: successors: %bb.28(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr45, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr44, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.28.Flow42: + ; GFX90A-NEXT: bb.28.Flow40: ; GFX90A-NEXT: successors: %bb.34(0x40000000), %bb.29(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr36_sgpr37, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: renamable $sgpr4_sgpr5 = S_XOR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.34, implicit $exec ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.29.Flow43: + ; GFX90A-NEXT: bb.29.Flow41: ; GFX90A-NEXT: successors: %bb.30(0x40000000), %bb.31(0x40000000) - ; GFX90A-NEXT: liveins: 
$sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc ; GFX90A-NEXT: $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr34_sgpr35, implicit-def dead $scc @@ -328,17 +326,17 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.30.bb19: ; GFX90A-NEXT: successors: %bb.31(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr41, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.31.Flow44: + ; GFX90A-NEXT: bb.31.Flow42: ; GFX90A-NEXT: successors: %bb.32(0x40000000), %bb.33(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr54_sgpr55, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr56_sgpr57, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr54_sgpr55, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr56_sgpr57, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECZ %bb.33, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.32.UnifiedUnreachableBlock: @@ -354,22 +352,22 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.34.bb26: ; GFX90A-NEXT: successors: %bb.29(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr43, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr42, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc ; GFX90A-NEXT: S_BRANCH %bb.29 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.35.bb20: ; GFX90A-NEXT: successors: %bb.37(0x40000000), %bb.36(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, 
$sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr42_sgpr43 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr42_sgpr43 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0 = GLOBAL_LOAD_SBYTE renamable $vgpr40_vgpr41, 1024, 0, implicit $exec :: (load (s8) from %ir.i21, addrspace 1) ; GFX90A-NEXT: renamable $vgpr42 = V_ADD_CO_U32_e32 1024, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $sgpr34_sgpr35 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr16_sgpr17 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 0 @@ -401,25 +399,25 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: $sgpr30_sgpr31 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.37, implicit $exec ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.36.Flow21: + ; GFX90A-NEXT: bb.36.Flow19: ; GFX90A-NEXT: successors: %bb.6(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, 
$sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr30_sgpr31, implicit-def $scc ; GFX90A-NEXT: S_BRANCH %bb.6 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.37.bb27: ; GFX90A-NEXT: successors: %bb.39(0x40000000), %bb.38(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr42_sgpr43, $sgpr56_sgpr57, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47, $sgpr44_sgpr45, $sgpr40_sgpr41 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr42_sgpr43, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47, $sgpr44_sgpr45, $sgpr40_sgpr41 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0 = GLOBAL_LOAD_UBYTE renamable $vgpr40_vgpr41, 2048, 0, implicit $exec :: (load (s8) from %ir.i28, addrspace 1) ; GFX90A-NEXT: renamable $vgpr44 = V_ADD_CO_U32_e32 2048, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $sgpr16_sgpr17 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = COPY renamable $sgpr28_sgpr29 - ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = COPY renamable $sgpr28_sgpr29 + ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr45, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec - ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr4_vgpr5 = IMPLICIT_DEF @@ -440,35 +438,35 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: $sgpr36_sgpr37 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.39, implicit $exec ; GFX90A-NEXT: {{ $}} - ; 
GFX90A-NEXT: bb.38.Flow22: + ; GFX90A-NEXT: bb.38.Flow20: ; GFX90A-NEXT: successors: %bb.36(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr36_sgpr37, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_XOR_B64 $exec, -1, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr38_sgpr39 = S_AND_B64 killed renamable $sgpr16_sgpr17, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr16_sgpr17 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, 
implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr16_sgpr17 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_AND_B64 killed renamable $sgpr42_sgpr43, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr28_sgpr29 = S_ANDN2_B64 killed renamable $sgpr28_sgpr29, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr28_sgpr29 = S_OR_B64 killed renamable $sgpr28_sgpr29, killed renamable $sgpr54_sgpr55, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr28_sgpr29 = S_OR_B64 killed renamable $sgpr28_sgpr29, killed renamable $sgpr56_sgpr57, implicit-def dead $scc ; GFX90A-NEXT: S_BRANCH %bb.36 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.39.bb34: ; GFX90A-NEXT: successors: %bb.41(0x40000000), %bb.40(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr40_sgpr41, $sgpr56_sgpr57, $sgpr52_sgpr53, $sgpr60_sgpr61, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47, $sgpr44_sgpr45 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr40_sgpr41, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47, $sgpr44_sgpr45 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0 = GLOBAL_LOAD_UBYTE renamable $vgpr40_vgpr41, 3072, 0, implicit $exec :: (load (s8) from %ir.i35, addrspace 1) ; GFX90A-NEXT: renamable $vgpr56 = V_ADD_CO_U32_e32 3072, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = COPY renamable $sgpr28_sgpr29 + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = COPY renamable $sgpr28_sgpr29 ; GFX90A-NEXT: renamable $vgpr57, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF @@ -490,29 +488,29 @@ define amdgpu_kernel void @f1(ptr 
addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: $sgpr38_sgpr39 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.41, implicit $exec ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.40.Flow23: + ; GFX90A-NEXT: bb.40.Flow21: ; GFX90A-NEXT: successors: %bb.38(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr38_sgpr39, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr16_sgpr17 = S_XOR_B64 $exec, -1, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_AND_B64 killed renamable $sgpr42_sgpr43, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr42_sgpr43, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr50_sgpr51, 
$exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_AND_B64 killed renamable $sgpr40_sgpr41, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr38_sgpr39 = S_ANDN2_B64 renamable $sgpr28_sgpr29, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_OR_B64 killed renamable $sgpr38_sgpr39, killed renamable $sgpr40_sgpr41, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr38_sgpr39, killed renamable $sgpr40_sgpr41, implicit-def dead $scc ; GFX90A-NEXT: S_BRANCH %bb.38 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.41.bb41: ; GFX90A-NEXT: successors: %bb.47(0x40000000), %bb.42(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr56_sgpr57, $sgpr52_sgpr53, $sgpr60_sgpr61, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr58 = V_ADD_CO_U32_e32 4096, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $sgpr16_sgpr17 = COPY $vcc @@ -520,7 +518,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr0 = GLOBAL_LOAD_UBYTE renamable $vgpr58_vgpr59, 0, 0, implicit $exec :: (load (s8) from %ir.i42, addrspace 1) ; GFX90A-NEXT: renamable $sgpr16_sgpr17 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = COPY renamable $sgpr28_sgpr29 + ; GFX90A-NEXT: renamable 
$sgpr56_sgpr57 = COPY renamable $sgpr28_sgpr29 ; GFX90A-NEXT: renamable $vgpr18, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF @@ -541,9 +539,9 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: $sgpr40_sgpr41 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.47, implicit $exec ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.42.Flow24: + ; GFX90A-NEXT: bb.42.Flow22: ; GFX90A-NEXT: successors: %bb.40(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr40_sgpr41, implicit-def $scc ; GFX90A-NEXT: renamable $vgpr59 = COPY killed renamable $vgpr18, implicit $exec @@ -552,23 +550,23 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr48_sgpr49, 
$exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_AND_B64 killed renamable $sgpr16_sgpr17, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr16_sgpr17 = S_ANDN2_B64 renamable $sgpr28_sgpr29, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_OR_B64 killed renamable $sgpr16_sgpr17, killed renamable $sgpr54_sgpr55, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr16_sgpr17, killed renamable $sgpr56_sgpr57, implicit-def dead $scc ; GFX90A-NEXT: S_BRANCH %bb.40 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.43.bb55: ; GFX90A-NEXT: successors: %bb.49(0x40000000), %bb.44(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr33, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr50_sgpr51, $sgpr56_sgpr57, $sgpr52_sgpr53, $sgpr44_sgpr45 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr33, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr50_sgpr51, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr44_sgpr45 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: S_BITCMP1_B32 killed renamable $sgpr33, 16, implicit-def $scc - ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_CSELECT_B64 -1, 0, implicit killed $scc + ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_CSELECT_B64 -1, 0, implicit killed $scc ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable 
$sgpr46_sgpr47 = S_XOR_B64 renamable $sgpr62_sgpr63, -1, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_XOR_B64 renamable $sgpr60_sgpr61, -1, implicit-def dead $scc ; GFX90A-NEXT: renamable $vgpr62 = V_ADD_CO_U32_e32 6144, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $vgpr63, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, renamable $sgpr46_sgpr47, implicit-def dead $scc @@ -576,7 +574,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.44: ; GFX90A-NEXT: successors: %bb.45(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr50_sgpr51, $sgpr56_sgpr57, $sgpr52_sgpr53 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr50_sgpr51, $sgpr54_sgpr55, $sgpr52_sgpr53 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = COPY renamable $sgpr28_sgpr29 ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF @@ -594,31 +592,31 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $sgpr15 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_MOV_B64 0 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.45.Flow26: + ; GFX90A-NEXT: bb.45.Flow24: ; GFX90A-NEXT: successors: %bb.46(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6, $sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18, $sgpr18_sgpr19, $sgpr20_sgpr21_sgpr22, $sgpr22_sgpr23, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr16, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr56, $vgpr57, $vgpr58, $vgpr60, $vgpr61, $vgpr62, $vgpr63 + ; GFX90A-NEXT: 
liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6, $sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr16_sgpr17_sgpr18, $sgpr18_sgpr19, $sgpr20_sgpr21_sgpr22, $sgpr22_sgpr23, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr16, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr56, $vgpr57, $vgpr58, $vgpr60, $vgpr61, $vgpr62, $vgpr63 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 0 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.46.Flow26: + ; GFX90A-NEXT: bb.46.Flow24: ; GFX90A-NEXT: successors: %bb.48(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_XOR_B64 $exec, -1, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr66_sgpr67 = S_AND_B64 killed renamable $sgpr42_sgpr43, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = 
S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_XOR_B64 $exec, -1, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_AND_B64 killed renamable $sgpr42_sgpr43, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_ANDN2_B64 renamable $sgpr28_sgpr29, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_OR_B64 killed renamable $sgpr44_sgpr45, killed renamable $sgpr46_sgpr47, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_OR_B64 killed renamable $sgpr44_sgpr45, killed renamable $sgpr46_sgpr47, implicit-def dead $scc ; GFX90A-NEXT: S_BRANCH %bb.48 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.47.bb48: ; GFX90A-NEXT: successors: %bb.43(0x40000000), %bb.48(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr33, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr56_sgpr57, $sgpr52_sgpr53, $sgpr60_sgpr61, $sgpr50_sgpr51, $sgpr44_sgpr45 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr33, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr50_sgpr51, $sgpr44_sgpr45 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr60 = V_ADD_CO_U32_e32 5120, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $sgpr16_sgpr17 = COPY $vcc @@ -626,12 +624,12 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; 
GFX90A-NEXT: renamable $vgpr1, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr0 = GLOBAL_LOAD_UBYTE killed renamable $vgpr0_vgpr1, 1024, 0, implicit $exec :: (load (s8) from %ir.i49, addrspace 1) ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = COPY renamable $sgpr28_sgpr29 - ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_MOV_B64 -1 + ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = COPY renamable $sgpr28_sgpr29 + ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr61, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $sgpr16_sgpr17, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec - ; GFX90A-NEXT: renamable $sgpr66_sgpr67 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr4_vgpr5 = IMPLICIT_DEF @@ -649,34 +647,34 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: $sgpr16_sgpr17 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.43, implicit $exec ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.48.Flow25: + ; GFX90A-NEXT: bb.48.Flow23: ; GFX90A-NEXT: successors: %bb.42(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, 
$vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr16_sgpr17, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_XOR_B64 $exec, -1, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr66_sgpr67, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr64_sgpr65, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr64_sgpr65, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr16_sgpr17 = S_AND_B64 killed renamable $sgpr42_sgpr43, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_ANDN2_B64 renamable $sgpr28_sgpr29, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_OR_B64 killed renamable $sgpr42_sgpr43, killed renamable $sgpr54_sgpr55, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr42_sgpr43, killed renamable $sgpr56_sgpr57, implicit-def dead $scc ; GFX90A-NEXT: S_BRANCH %bb.42 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.49.bb63: ; GFX90A-NEXT: successors: %bb.51(0x40000000), %bb.50(0x40000000) - ; GFX90A-NEXT: liveins: $vcc, $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr46_sgpr47, $sgpr54_sgpr55:0x000000000000000F, $sgpr62_sgpr63, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, 
$sgpr0_sgpr1_sgpr2_sgpr3, $sgpr44_sgpr45, $sgpr50_sgpr51, $sgpr56_sgpr57, $sgpr52_sgpr53 + ; GFX90A-NEXT: liveins: $vcc, $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr46_sgpr47, $sgpr60_sgpr61, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr44_sgpr45, $sgpr50_sgpr51, $sgpr54_sgpr55, $sgpr52_sgpr53 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_MOV_B64 0 ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.51, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.50: ; GFX90A-NEXT: successors: %bb.45(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr44_sgpr45, $sgpr50_sgpr51, $sgpr56_sgpr57, $sgpr52_sgpr53 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr44_sgpr45, $sgpr50_sgpr51, $sgpr54_sgpr55, $sgpr52_sgpr53 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = COPY renamable $sgpr28_sgpr29 ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF @@ -696,7 +694,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.51.bb68: ; GFX90A-NEXT: successors: %bb.55(0x40000000), %bb.52(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr54_sgpr55:0x000000000000000F, $sgpr62_sgpr63, 
$sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr50_sgpr51, $sgpr56_sgpr57, $sgpr52_sgpr53 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr60_sgpr61, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr50_sgpr51, $sgpr54_sgpr55, $sgpr52_sgpr53 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0 = nuw nsw V_LSHLREV_B32_e32 3, $vgpr30, implicit $exec ; GFX90A-NEXT: renamable $vgpr1 = V_MOV_B32_e32 0, implicit $exec @@ -706,7 +704,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.52: ; GFX90A-NEXT: successors: %bb.46(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr50_sgpr51, $sgpr56_sgpr57, $sgpr52_sgpr53 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr50_sgpr51, $sgpr54_sgpr55, $sgpr52_sgpr53 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = COPY renamable $sgpr28_sgpr29 @@ -726,7 +724,7 @@ 
define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.53.bb80: ; GFX90A-NEXT: successors: %bb.60(0x40000000), %bb.54(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr54_sgpr55:0x000000000000000F, $sgpr58_sgpr59, $sgpr62_sgpr63, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr56_sgpr57, $sgpr60_sgpr61, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr15 = S_BFE_U32 renamable $sgpr20, 65560, implicit-def dead $scc ; GFX90A-NEXT: S_CMP_EQ_U32 killed renamable $sgpr15, 0, implicit-def $scc @@ -736,11 +734,11 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.54: ; GFX90A-NEXT: successors: %bb.62(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr58_sgpr59, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, 
$sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = COPY renamable $sgpr28_sgpr29 + ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = COPY renamable $sgpr28_sgpr29 ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF @@ -755,16 +753,16 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.55.bb73: ; GFX90A-NEXT: successors: %bb.53(0x40000000), %bb.56(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr54_sgpr55:0x000000000000000F, $sgpr62_sgpr63, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr50_sgpr51 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr60_sgpr61, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr50_sgpr51 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr6 = GLOBAL_LOAD_UBYTE renamable $vgpr0_vgpr1, 2048, 0, implicit $exec :: (load (s8) from %ir.i74, addrspace 1) ; GFX90A-NEXT: renamable $vgpr4 = V_ADD_CO_U32_e32 2048, $vgpr0, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = COPY renamable $sgpr28_sgpr29 - ; GFX90A-NEXT: renamable $vgpr5, dead renamable $sgpr56_sgpr57 = V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec + ; 
GFX90A-NEXT: renamable $vgpr5, dead renamable $sgpr54_sgpr55 = V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr6, implicit $exec - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF @@ -776,34 +774,35 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $sgpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: $sgpr58_sgpr59 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: $sgpr56_sgpr57 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.53, implicit $exec ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.56.Flow29: + ; GFX90A-NEXT: bb.56.Flow27: ; GFX90A-NEXT: successors: %bb.46(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed 
renamable $sgpr58_sgpr59, implicit-def $scc + ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr56_sgpr57, implicit-def $scc ; GFX90A-NEXT: S_BRANCH %bb.46 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.57.bb90: ; GFX90A-NEXT: successors: %bb.61(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr50_sgpr51, $sgpr54_sgpr55:0x000000000000000F, $sgpr58_sgpr59, $sgpr62_sgpr63, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr50_sgpr51, $sgpr56_sgpr57, $sgpr60_sgpr61, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $vgpr53 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr62_sgpr63, implicit $exec + ; GFX90A-NEXT: renamable $vgpr53 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr60_sgpr61, implicit $exec + ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_LOAD_DWORDX2_IMM renamable $sgpr6_sgpr7, 16, 0 :: (dereferenceable invariant load (s64) from %ir.arg2.kernarg.offset, align 16, addrspace 4) ; GFX90A-NEXT: renamable $vgpr10 = V_MOV_B32_e32 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr14_vgpr15 = DS_READ_B64_gfx9 killed renamable $vgpr10, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) null`, addrspace 3) ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $sgpr21, implicit $exec ; GFX90A-NEXT: renamable $vgpr16_vgpr17 = DS_READ_B64_gfx9 killed renamable $vgpr10, 0, 0, implicit $exec :: (load (s64) from %ir.7, addrspace 3) ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $sgpr22, implicit $exec ; GFX90A-NEXT: renamable $vgpr12_vgpr13 = DS_READ_B64_gfx9 killed renamable $vgpr10, 0, 0, implicit $exec :: (load (s64) from %ir.8, addrspace 3) - ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $sgpr54, implicit $exec - ; GFX90A-NEXT: renamable $vgpr11 = V_ALIGNBIT_B32_e64 killed $sgpr55, killed $vgpr10, 1, implicit $exec + ; GFX90A-NEXT: renamable $vgpr10 = 
COPY renamable $sgpr48, implicit $exec + ; GFX90A-NEXT: renamable $vgpr11 = V_ALIGNBIT_B32_e64 killed $sgpr49, killed $vgpr10, 1, implicit $exec ; GFX90A-NEXT: renamable $vgpr52 = V_ALIGNBIT_B32_e64 $vgpr17, $vgpr16, 1, implicit $exec ; GFX90A-NEXT: renamable $vgpr17 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr8_sgpr9, implicit $exec ; GFX90A-NEXT: renamable $vgpr15 = V_ALIGNBIT_B32_e64 $vgpr15, $vgpr14, 1, implicit $exec ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_XOR_B64 $exec, -1, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_OR_B64 renamable $sgpr28_sgpr29, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_OR_B64 renamable $sgpr28_sgpr29, $exec, implicit-def dead $scc ; GFX90A-NEXT: S_BRANCH %bb.61 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.58: @@ -812,7 +811,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr15 = COPY killed renamable $sgpr23, implicit $exec ; GFX90A-NEXT: renamable $vgpr17 = COPY killed renamable $sgpr15, implicit $exec - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr16_sgpr17 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 0 @@ -845,8 +844,9 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.59.bb105: ; GFX90A-NEXT: successors: %bb.3(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr30_sgpr31, $sgpr42_sgpr43, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000FF, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr30_sgpr31, $sgpr42_sgpr43, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: renamable $sgpr15 = S_LOAD_DWORD_IMM renamable $sgpr6_sgpr7, 40, 0 :: (dereferenceable invariant load (s32) from %ir.arg10.kernarg.offset, align 8, addrspace 4) ; GFX90A-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr22_vgpr23 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) null`, addrspace 3) ; GFX90A-NEXT: renamable $vgpr0 = COPY renamable $sgpr23, implicit $exec @@ -854,7 +854,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr0 = COPY renamable $sgpr21, implicit $exec ; GFX90A-NEXT: renamable $vgpr18_vgpr19 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.7, addrspace 3) ; GFX90A-NEXT: renamable $vgpr0 = COPY killed renamable $sgpr15, implicit $exec - ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.435, addrspace 3) + ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.arg10.load, addrspace 3) ; GFX90A-NEXT: renamable $vgpr0 = COPY renamable $sgpr22, 
implicit $exec ; GFX90A-NEXT: renamable $vgpr24_vgpr25 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.8, addrspace 3) ; GFX90A-NEXT: renamable $sgpr28_sgpr29 = S_MOV_B64 -1 @@ -864,7 +864,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.60.bb85: ; GFX90A-NEXT: successors: %bb.57(0x40000000), %bb.61(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr54_sgpr55:0x000000000000000F, $sgpr58_sgpr59, $sgpr62_sgpr63, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr56_sgpr57, $sgpr60_sgpr61, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr8 = V_OR_B32_e32 1, $vgpr6, implicit $exec ; GFX90A-NEXT: renamable $vgpr9 = COPY renamable $vgpr7, implicit $exec @@ -872,7 +872,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $sgpr15 = S_MOV_B32 0 ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr10, implicit $exec - ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = COPY renamable $sgpr28_sgpr29 + ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = COPY renamable $sgpr28_sgpr29 ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF @@ -884,67 +884,67 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: $sgpr50_sgpr51 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.57, implicit $exec ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.61.Flow31: + ; GFX90A-NEXT: bb.61.Flow29: ; GFX90A-NEXT: successors: %bb.62(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, 
$vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr50_sgpr51, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $vgpr14, implicit $exec ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.62.Flow30: + ; GFX90A-NEXT: bb.62.Flow28: ; GFX90A-NEXT: successors: %bb.56(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, 
$vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_XOR_B64 $exec, -1, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_ANDN2_B64 renamable $sgpr28_sgpr29, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_OR_B64 killed renamable $sgpr48_sgpr49, killed renamable $sgpr54_sgpr55, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_OR_B64 killed renamable $sgpr48_sgpr49, killed renamable $sgpr58_sgpr59, implicit-def dead $scc ; GFX90A-NEXT: S_BRANCH %bb.56 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.63.bb140: ; GFX90A-NEXT: successors: %bb.69(0x40000000), %bb.64(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, 
$vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr28_sgpr29 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr26_sgpr27, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.69, implicit $vcc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.64.Flow13: + ; GFX90A-NEXT: bb.64.Flow11: ; GFX90A-NEXT: successors: %bb.65(0x40000000), %bb.67(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, 
$vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr28_sgpr29, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.67, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.65.bb159: ; GFX90A-NEXT: successors: %bb.68(0x40000000), %bb.66(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vcc = V_CMP_NE_U32_e64 0, killed $vgpr30, implicit $exec ; GFX90A-NEXT: $sgpr8_sgpr9 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: renamable $sgpr8_sgpr9 = S_XOR_B64 $exec, killed renamable $sgpr8_sgpr9, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.68, implicit $exec ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.66.Flow10: + ; GFX90A-NEXT: bb.66.Flow8: ; GFX90A-NEXT: successors: %bb.67(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, 
$sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $sgpr8_sgpr9 = S_ANDN2_SAVEEXEC_B64 $sgpr8_sgpr9, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr8_sgpr9, implicit-def $scc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.67.Flow14: + ; GFX90A-NEXT: bb.67.Flow12: ; GFX90A-NEXT: successors: %bb.8(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = COPY $exec + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = COPY $exec ; GFX90A-NEXT: S_BRANCH %bb.8 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.68.bb161: 
; GFX90A-NEXT: successors: %bb.66(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr21, killed $vgpr23, implicit $exec ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr25, implicit $exec @@ -963,7 +963,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.69.bb174: ; GFX90A-NEXT: successors: %bb.73(0x40000000), %bb.70(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, 
$vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr26 = V_OR_B32_e32 1, $vgpr24, implicit $exec ; GFX90A-NEXT: renamable $vgpr48 = V_OR_B32_e32 $vgpr26, $vgpr22, implicit $exec @@ -979,14 +979,14 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.70.Flow: ; GFX90A-NEXT: successors: %bb.71(0x40000000), %bb.72(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, 
$sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr8_sgpr9, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.72, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.71.bb186: ; GFX90A-NEXT: successors: %bb.72(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, 
$vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr2_vgpr3 = V_LSHLREV_B64_e64 3, killed $vgpr2_vgpr3, implicit $exec ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $sgpr19, implicit $exec @@ -1013,16 +1013,16 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.72.Flow9: + ; GFX90A-NEXT: bb.72.Flow7: ; GFX90A-NEXT: successors: %bb.64(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, 
$vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr28_sgpr29 = S_MOV_B64 0 ; GFX90A-NEXT: S_BRANCH %bb.64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.73.bb196: ; GFX90A-NEXT: successors: %bb.70(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr10 = V_OR_B32_e32 $vgpr50, killed $vgpr16, implicit $exec ; GFX90A-NEXT: renamable $vgpr54 = V_OR_B32_e32 killed $vgpr10, 
killed $vgpr14, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll index e9dbce9026ca0..cdcc8915da23d 100644 --- a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll +++ b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll @@ -164,32 +164,32 @@ define amdgpu_kernel void @test_copy_v4i8_x4(ptr addrspace(1) %out0, ptr addrspa ; SI-LABEL: test_copy_v4i8_x4: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x11 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s14, s2 -; SI-NEXT: s_mov_b32 s15, s3 -; SI-NEXT: s_mov_b32 s18, s2 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s18, s10 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_mov_b32 s19, s3 -; SI-NEXT: s_mov_b32 s22, s2 -; SI-NEXT: s_mov_b32 s23, s3 -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s16, s8 -; SI-NEXT: s_mov_b32 s17, s9 -; SI-NEXT: s_mov_b32 s20, s10 -; SI-NEXT: s_mov_b32 s21, s11 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s22, s10 +; SI-NEXT: s_mov_b32 s23, s11 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s16, s4 +; SI-NEXT: s_mov_b32 s17, s5 +; SI-NEXT: s_mov_b32 s20, s6 +; SI-NEXT: s_mov_b32 s21, s7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; SI-NEXT: buffer_store_dword v0, off, s[16:19], 0 ; SI-NEXT: buffer_store_dword v0, off, s[20:23], 0 @@ -326,23 +326,23 @@ define amdgpu_kernel void @test_copy_v4i8_extra_use(ptr addrspace(1) %out0, ptr define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_copy_v4i8_x2_extra_use: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 -; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s15, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[12:13], s[6:7] +; SI-NEXT: s_mov_b64 s[12:13], s[10:11] ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s14, s10 -; SI-NEXT: s_mov_b32 s8, s0 -; SI-NEXT: s_mov_b32 s9, s1 -; SI-NEXT: s_mov_b32 s12, s2 -; SI-NEXT: s_mov_b32 s13, s3 -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s12, s6 +; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-NEXT: v_add_i32_e32 v3, vcc, 9, v0 @@ -358,9 +358,9 @@ define amdgpu_kernel void 
@test_copy_v4i8_x2_extra_use(ptr addrspace(1) %out0, p ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x9000000, v1 -; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: buffer_store_dword v1, off, s[12:15], 0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_copy_v4i8_x2_extra_use: diff --git a/llvm/test/CodeGen/AMDGPU/ctpop64.ll b/llvm/test/CodeGen/AMDGPU/ctpop64.ll index 131ce14a7847c..d199a03a7ddd0 100644 --- a/llvm/test/CodeGen/AMDGPU/ctpop64.ll +++ b/llvm/test/CodeGen/AMDGPU/ctpop64.ll @@ -334,58 +334,59 @@ define amdgpu_kernel void @v_ctpop_v4i64(ptr addrspace(1) noalias %out, ptr addr define amdgpu_kernel void @ctpop_i64_in_br(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %ctpop_arg, i32 %cond) { ; SI-LABEL: ctpop_i64_in_br: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s8, s[0:1], 0xf +; SI-NEXT: s_load_dword s2, s[0:1], 0xf ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s8, 0 +; SI-NEXT: s_cmp_lg_u32 s2, 0 ; SI-NEXT: s_cbranch_scc0 .LBB7_4 ; SI-NEXT: ; %bb.1: ; %else -; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2 +; SI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x2 ; SI-NEXT: s_mov_b64 s[6:7], 0 ; SI-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 vcc, vcc ; SI-NEXT: s_cbranch_vccnz .LBB7_3 ; SI-NEXT: .LBB7_2: ; %if -; SI-NEXT: s_bcnt1_i32_b64 s0, s[2:3] -; SI-NEXT: s_mov_b32 s1, 0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_bcnt1_i32_b64 s2, s[0:1] ; SI-NEXT: .LBB7_3: ; %endif -; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v0, s2 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_mov_b32_e32 v1, s3 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; SI-NEXT: .LBB7_4: -; SI-NEXT: ; implicit-def: $sgpr0_sgpr1 +; SI-NEXT: ; implicit-def: $sgpr2_sgpr3 ; SI-NEXT: s_branch .LBB7_2 ; ; VI-LABEL: ctpop_i64_in_br: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s8, s[0:1], 0x3c +; VI-NEXT: s_load_dword s2, s[0:1], 0x3c ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s8, 0 +; VI-NEXT: s_cmp_lg_u32 s2, 0 ; VI-NEXT: s_cbranch_scc0 .LBB7_4 ; VI-NEXT: ; %bb.1: ; %else -; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; VI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x8 ; VI-NEXT: s_cbranch_execnz .LBB7_3 ; VI-NEXT: .LBB7_2: ; %if +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bcnt1_i32_b64 s0, s[2:3] -; VI-NEXT: s_mov_b32 s1, 0 +; VI-NEXT: s_mov_b32 s3, 0 +; VI-NEXT: s_bcnt1_i32_b64 s2, s[0:1] ; VI-NEXT: .LBB7_3: ; %endif ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; VI-NEXT: .LBB7_4: -; VI-NEXT: ; implicit-def: $sgpr0_sgpr1 +; VI-NEXT: ; implicit-def: $sgpr2_sgpr3 ; VI-NEXT: s_branch .LBB7_2 entry: %tmp0 = icmp eq i32 %cond, 0 diff --git 
a/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll b/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll index 53acbb6a7bceb..5af5006db2b8f 100644 --- a/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll @@ -6,14 +6,14 @@ define amdgpu_kernel void @eggs(i1 %arg, ptr addrspace(1) %arg1, ptr %arg2, ptr %arg3, ptr %arg4, ptr %arg5, ptr %arg6, ptr %arg7, ptr %arg8, ptr %arg9) { ; CHECK-LABEL: eggs: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_load_dword s0, s[4:5], 0x0 -; CHECK-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x8 +; CHECK-NEXT: s_load_dword s6, s[4:5], 0x0 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_bitcmp0_b32 s0, 0 +; CHECK-NEXT: s_bitcmp0_b32 s6, 0 ; CHECK-NEXT: s_cbranch_scc1 .LBB0_2 ; CHECK-NEXT: ; %bb.1: ; %bb10 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: global_load_dwordx2 v[0:1], v0, s[8:9] +; CHECK-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_lshrrev_b32_e32 v7, 8, v0 ; CHECK-NEXT: v_lshrrev_b32_e32 v6, 16, v0 @@ -32,21 +32,24 @@ define amdgpu_kernel void @eggs(i1 %arg, ptr addrspace(1) %arg1, ptr %arg2, ptr ; CHECK-NEXT: v_mov_b32_e32 v7, 0 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: .LBB0_3: ; %bb41 -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x48 -; CHECK-NEXT: v_mov_b32_e32 v8, s10 -; CHECK-NEXT: v_mov_b32_e32 v9, s11 -; CHECK-NEXT: v_mov_b32_e32 v10, s12 -; CHECK-NEXT: v_mov_b32_e32 v11, s13 -; CHECK-NEXT: v_mov_b32_e32 v12, s14 -; CHECK-NEXT: v_mov_b32_e32 v13, s15 -; CHECK-NEXT: v_mov_b32_e32 v14, s16 -; CHECK-NEXT: v_mov_b32_e32 v15, s17 -; CHECK-NEXT: v_mov_b32_e32 v16, s18 -; CHECK-NEXT: v_mov_b32_e32 v17, s19 -; CHECK-NEXT: v_mov_b32_e32 v18, s20 -; CHECK-NEXT: v_mov_b32_e32 v19, s21 -; CHECK-NEXT: v_mov_b32_e32 v20, s22 -; CHECK-NEXT: v_mov_b32_e32 v21, s23 +; CHECK-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x18 +; CHECK-NEXT: v_mov_b32_e32 v8, s2 +; CHECK-NEXT: v_mov_b32_e32 v9, s3 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x38 +; CHECK-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x48 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v10, s8 +; CHECK-NEXT: v_mov_b32_e32 v11, s9 +; CHECK-NEXT: v_mov_b32_e32 v12, s10 +; CHECK-NEXT: v_mov_b32_e32 v13, s11 +; CHECK-NEXT: v_mov_b32_e32 v14, s12 +; CHECK-NEXT: v_mov_b32_e32 v15, s13 +; CHECK-NEXT: v_mov_b32_e32 v16, s14 +; CHECK-NEXT: v_mov_b32_e32 v17, s15 +; CHECK-NEXT: v_mov_b32_e32 v18, s0 +; CHECK-NEXT: v_mov_b32_e32 v19, s1 +; CHECK-NEXT: v_mov_b32_e32 v20, s2 +; CHECK-NEXT: v_mov_b32_e32 v21, s3 ; CHECK-NEXT: flat_store_byte v[8:9], v0 ; CHECK-NEXT: flat_store_byte v[10:11], v7 ; CHECK-NEXT: flat_store_byte v[12:13], v6 @@ -54,8 +57,7 @@ define amdgpu_kernel void @eggs(i1 %arg, ptr addrspace(1) %arg1, ptr %arg2, ptr ; CHECK-NEXT: flat_store_byte v[16:17], v1 ; CHECK-NEXT: flat_store_byte v[18:19], v4 ; CHECK-NEXT: flat_store_byte v[20:21], v3 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; CHECK-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; CHECK-NEXT: flat_store_byte v[0:1], v2 ; CHECK-NEXT: s_endpgm bb: diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll index b5fa3fd9eccc1..6021299a858b8 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll @@ -693,70 +693,82 @@ define amdgpu_kernel void 
@s_test_copysign_v2f64(ptr addrspace(1) %out, <2 x dou define amdgpu_kernel void @s_test_copysign_v3f64(ptr addrspace(1) %out, <3 x double> %mag, <3 x double> %sign) { ; SI-LABEL: s_test_copysign_v3f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x11 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x11 +; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x15 +; SI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x19 +; SI-NEXT: s_load_dword s0, s[0:1], 0x1e +; SI-NEXT: s_brev_b32 s1, -2 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_brev_b32 s10, -2 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mov_b32_e32 v0, s7 +; SI-NEXT: v_mov_b32_e32 v0, s11 +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, s15 -; SI-NEXT: v_bfi_b32 v3, s10, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s5 -; SI-NEXT: v_mov_b32_e32 v1, s13 -; SI-NEXT: v_bfi_b32 v1, s10, v0, v1 +; SI-NEXT: v_bfi_b32 v3, s1, v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, s9 -; SI-NEXT: v_mov_b32_e32 v2, s17 -; SI-NEXT: v_bfi_b32 v5, s10, v0, v2 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: v_mov_b32_e32 v1, s13 +; SI-NEXT: v_bfi_b32 v1, s1, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s3 +; SI-NEXT: v_mov_b32_e32 v2, s0 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_bfi_b32 v5, s1, v0, v2 +; SI-NEXT: v_mov_b32_e32 v4, s2 +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_v3f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x44 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; VI-NEXT: s_brev_b32 s2, -2 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44 +; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x64 +; VI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x54 +; VI-NEXT: s_load_dword s0, s[0:1], 0x78 +; VI-NEXT: s_brev_b32 s1, -2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s7 -; VI-NEXT: v_mov_b32_e32 v1, s15 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_bfi_b32 v3, s2, v0, v1 +; VI-NEXT: v_mov_b32_e32 v1, s11 +; VI-NEXT: v_bfi_b32 v3, s1, v0, v1 +; VI-NEXT: v_mov_b32_e32 v0, s5 +; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_bfi_b32 v1, s1, v0, v1 ; VI-NEXT: v_mov_b32_e32 v0, s13 -; VI-NEXT: v_bfi_b32 v1, s2, v2, v0 -; VI-NEXT: v_mov_b32_e32 v0, s9 -; VI-NEXT: v_mov_b32_e32 v2, s17 -; VI-NEXT: v_bfi_b32 v5, s2, v0, v2 -; VI-NEXT: s_add_u32 s2, s0, 16 -; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v7, s3 -; VI-NEXT: v_mov_b32_e32 v4, s8 -; VI-NEXT: v_mov_b32_e32 v6, s2 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_add_u32 s0, s2, 16 +; VI-NEXT: v_bfi_b32 v5, s1, v0, v2 +; VI-NEXT: s_addc_u32 s1, s3, 0 +; VI-NEXT: v_mov_b32_e32 v7, s1 +; VI-NEXT: v_mov_b32_e32 v4, s12 +; VI-NEXT: v_mov_b32_e32 v6, s0 ; VI-NEXT: flat_store_dwordx2 v[6:7], v[4:5] -; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_v3f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; 
GFX11-NEXT: s_load_b512 s[4:19], s[0:1], 0x44 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x64 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x78 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x54 +; GFX11-NEXT: s_load_b128 s[8:11], s[0:1], 0x44 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v6, 0 :: v_dual_mov_b32 v1, s15 -; GFX11-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v0, s4 -; GFX11-NEXT: v_dual_mov_b32 v7, s13 :: v_dual_mov_b32 v4, s8 -; GFX11-NEXT: v_mov_b32_e32 v2, s6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_bfi_b32 v5, 0x7fffffff, s9, v5 -; GFX11-NEXT: v_bfi_b32 v3, 0x7fffffff, s7, v1 -; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s5, v7 +; GFX11-NEXT: v_dual_mov_b32 v6, 0 :: v_dual_mov_b32 v5, s4 +; GFX11-NEXT: v_mov_b32_e32 v4, s2 +; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s7 +; GFX11-NEXT: v_mov_b32_e32 v7, s5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfi_b32 v5, 0x7fffffff, s3, v5 +; GFX11-NEXT: v_mov_b32_e32 v2, s10 +; GFX11-NEXT: v_bfi_b32 v3, 0x7fffffff, s11, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s9, v7 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b64 v6, v[4:5], s[0:1] offset:16 ; GFX11-NEXT: global_store_b128 v6, v[0:3], s[0:1] @@ -771,45 +783,46 @@ define amdgpu_kernel void @s_test_copysign_v3f64(ptr addrspace(1) %out, <3 x dou define amdgpu_kernel void @s_test_copysign_v4f64(ptr addrspace(1) %out, <4 x double> %mag, <4 x double> %sign) { ; SI-LABEL: s_test_copysign_v4f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x11 +; SI-NEXT: s_load_dwordx2 s[20:21], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11 +; SI-NEXT: s_load_dwordx8 s[12:19], s[0:1], 0x19 +; SI-NEXT: s_brev_b32 s0, -2 +; SI-NEXT: s_mov_b32 s23, 0xf000 +; SI-NEXT: s_mov_b32 s22, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_brev_b32 s12, -2 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s7 ; SI-NEXT: v_mov_b32_e32 v1, s15 -; SI-NEXT: v_bfi_b32 v3, s12, v0, v1 +; SI-NEXT: v_bfi_b32 v3, s0, v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, s5 ; SI-NEXT: v_mov_b32_e32 v1, s13 -; SI-NEXT: v_bfi_b32 v1, s12, v0, v1 +; SI-NEXT: v_bfi_b32 v1, s0, v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, s11 ; SI-NEXT: v_mov_b32_e32 v2, s19 -; SI-NEXT: v_bfi_b32 v7, s12, v0, v2 +; SI-NEXT: v_bfi_b32 v7, s0, v0, v2 ; SI-NEXT: v_mov_b32_e32 v0, s9 ; SI-NEXT: v_mov_b32_e32 v2, s17 -; SI-NEXT: v_bfi_b32 v5, s12, v0, v2 +; SI-NEXT: v_bfi_b32 v5, s0, v0, v2 ; SI-NEXT: v_mov_b32_e32 v4, s8 ; SI-NEXT: v_mov_b32_e32 v6, s10 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[20:23], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_v4f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x44 +; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 +; VI-NEXT: s_load_dwordx8 s[12:19], s[0:1], 0x64 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_brev_b32 s2, -2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s7 ; 
VI-NEXT: v_mov_b32_e32 v1, s15 -; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: v_bfi_b32 v3, s2, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s13 -; VI-NEXT: v_bfi_b32 v1, s2, v2, v0 +; VI-NEXT: v_mov_b32_e32 v0, s5 +; VI-NEXT: v_mov_b32_e32 v1, s13 +; VI-NEXT: v_bfi_b32 v1, s2, v0, v1 ; VI-NEXT: v_mov_b32_e32 v0, s11 ; VI-NEXT: v_mov_b32_e32 v2, s19 ; VI-NEXT: v_bfi_b32 v7, s2, v0, v2 @@ -832,21 +845,22 @@ define amdgpu_kernel void @s_test_copysign_v4f64(ptr addrspace(1) %out, <4 x dou ; ; GFX11-LABEL: s_test_copysign_v4f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b512 s[4:19], s[0:1], 0x44 +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x64 +; GFX11-NEXT: s_load_b256 s[12:19], s[0:1], 0x44 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s15 -; GFX11-NEXT: v_dual_mov_b32 v3, s19 :: v_dual_mov_b32 v2, s10 -; GFX11-NEXT: v_dual_mov_b32 v9, s17 :: v_dual_mov_b32 v4, s4 -; GFX11-NEXT: v_dual_mov_b32 v5, s13 :: v_dual_mov_b32 v0, s8 +; GFX11-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s7 +; GFX11-NEXT: v_dual_mov_b32 v3, s11 :: v_dual_mov_b32 v2, s18 +; GFX11-NEXT: v_dual_mov_b32 v9, s9 :: v_dual_mov_b32 v4, s12 +; GFX11-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v0, s16 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_bfi_b32 v7, 0x7fffffff, s7, v1 -; GFX11-NEXT: v_bfi_b32 v3, 0x7fffffff, s11, v3 +; GFX11-NEXT: v_bfi_b32 v7, 0x7fffffff, s15, v1 +; GFX11-NEXT: v_bfi_b32 v3, 0x7fffffff, s19, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s9, v9 -; GFX11-NEXT: v_mov_b32_e32 v6, s6 -; GFX11-NEXT: v_bfi_b32 v5, 0x7fffffff, s5, v5 +; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s17, v9 +; GFX11-NEXT: v_mov_b32_e32 v6, s14 +; GFX11-NEXT: v_bfi_b32 v5, 0x7fffffff, s13, v5 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16 ; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll index b8c8d993d389b..325357cb3844a 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll @@ -53,39 +53,39 @@ entry: define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_add_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s2, s4, 32 +; GCN1-NEXT: s_addc_u32 s3, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_add_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: 
s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s2, s4, 32 +; GCN2-NEXT: s_addc_u32 s3, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; @@ -96,7 +96,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -286,9 +286,9 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -303,9 +303,9 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -321,7 +321,7 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -501,39 +501,39 @@ entry: define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_and_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s2, s4, 32 +; 
GCN1-NEXT: s_addc_u32 s3, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_and_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s2, s4, 32 +; GCN2-NEXT: s_addc_u32 s3, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; @@ -544,7 +544,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -734,9 +734,9 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -751,9 +751,9 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -769,7 +769,7 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: 
flat_atomic_and_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -949,39 +949,39 @@ entry: define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_sub_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s2, s4, 32 +; GCN1-NEXT: s_addc_u32 s3, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_sub_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s2, s4, 32 +; GCN2-NEXT: s_addc_u32 s3, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; @@ -992,7 +992,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -1182,9 +1182,9 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -1199,9 +1199,9 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt 
lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -1217,7 +1217,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -1395,38 +1395,38 @@ entry: define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_max_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s2, s4, 32 +; GCN1-NEXT: s_addc_u32 s3, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_max_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s2, s4, 32 +; GCN2-NEXT: s_addc_u32 s3, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm @@ -1438,7 +1438,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: 
global_inv scope:SCOPE_SE @@ -1624,9 +1624,9 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v2, s6 @@ -1641,9 +1641,9 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v2, s6 @@ -1659,7 +1659,7 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE @@ -1835,38 +1835,38 @@ entry: define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_umax_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s2, s4, 32 +; GCN1-NEXT: s_addc_u32 s3, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umax_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s2, s4, 32 +; GCN2-NEXT: s_addc_u32 s3, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; 
GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm @@ -1878,7 +1878,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE @@ -2064,9 +2064,9 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v2, s6 @@ -2081,9 +2081,9 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v2, s6 @@ -2099,7 +2099,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE @@ -2275,38 +2275,38 @@ entry: define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_min_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s2, s4, 32 +; GCN1-NEXT: s_addc_u32 s3, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_min_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[4:7], 
s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s2, s4, 32 +; GCN2-NEXT: s_addc_u32 s3, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm @@ -2318,7 +2318,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE @@ -2504,9 +2504,9 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v2, s6 @@ -2521,9 +2521,9 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v2, s6 @@ -2539,7 +2539,7 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE @@ -2715,38 +2715,38 @@ entry: define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_umin_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s2, s4, 32 +; GCN1-NEXT: s_addc_u32 s3, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; 
GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umin_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s2, s4, 32 +; GCN2-NEXT: s_addc_u32 s3, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm @@ -2758,7 +2758,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE @@ -2944,9 +2944,9 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v2, s6 @@ -2961,9 +2961,9 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v2, s6 @@ -2979,7 +2979,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE @@ -3157,39 +3157,39 @@ 
entry: define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_or_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s2, s4, 32 +; GCN1-NEXT: s_addc_u32 s3, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_or_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s2, s4, 32 +; GCN2-NEXT: s_addc_u32 s3, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; @@ -3200,7 +3200,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -3390,9 +3390,9 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -3407,9 +3407,9 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: v_mov_b32_e32 
v3, s1 ; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -3425,7 +3425,7 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -3699,39 +3699,39 @@ entry: define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_xchg_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s2, s4, 32 +; GCN1-NEXT: s_addc_u32 s3, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_xchg_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s2, s4, 32 +; GCN2-NEXT: s_addc_u32 s3, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; @@ -3742,7 +3742,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -3932,9 +3932,9 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -3949,9 +3949,9 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -3967,7 +3967,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -4147,39 +4147,39 @@ entry: define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_xor_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s2, s4, 32 +; GCN1-NEXT: s_addc_u32 s3, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_xor_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s2, s4, 32 +; GCN2-NEXT: s_addc_u32 s3, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: 
flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; @@ -4190,7 +4190,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -4380,9 +4380,9 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -4397,9 +4397,9 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -4415,7 +4415,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -5070,10 +5070,10 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr %out, ptr %out2, i6 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v4, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v3, s7 -; GCN1-NEXT: v_mov_b32_e32 v4, s0 ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -5090,10 +5090,10 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr %out, ptr %out2, i6 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v4, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v3, s7 -; GCN2-NEXT: v_mov_b32_e32 v4, s0 ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -5106,8 +5106,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr %out, ptr %out2, i6 ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: 
v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] offset:32 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -5315,9 +5315,9 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr %out, ptr %out2, i64 %in, ; GCN1: ; %bb.0: ; %entry ; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v4, s0 ; GCN1-NEXT: v_mov_b32_e32 v5, s1 -; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v3, s7 @@ -5333,9 +5333,9 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr %out, ptr %out2, i64 %in, ; GCN2: ; %bb.0: ; %entry ; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v4, s0 ; GCN2-NEXT: v_mov_b32_e32 v5, s1 -; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v3, s7 @@ -5351,8 +5351,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr %out, ptr %out2, i64 %in, ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 ; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -5947,39 +5947,39 @@ entry: define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_inc_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s2, s4, 32 +; GCN1-NEXT: s_addc_u32 s3, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_inc_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s2, s4, 32 +; GCN2-NEXT: s_addc_u32 s3, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], 
v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; @@ -5990,7 +5990,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -6180,9 +6180,9 @@ define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -6197,9 +6197,9 @@ define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -6215,7 +6215,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -6395,39 +6395,39 @@ entry: define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_dec_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s2, s4, 32 +; GCN1-NEXT: s_addc_u32 s3, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_dec_i64_ret_offset: ; GCN2: ; %bb.0: ; 
%entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s2, s4, 32 +; GCN2-NEXT: s_addc_u32 s3, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; @@ -6438,7 +6438,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -6628,9 +6628,9 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -6645,9 +6645,9 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -6663,7 +6663,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV diff --git a/llvm/test/CodeGen/AMDGPU/fma-combine.ll b/llvm/test/CodeGen/AMDGPU/fma-combine.ll index bac2d8b8b40c2..af9abf3f2a8d0 100644 --- a/llvm/test/CodeGen/AMDGPU/fma-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/fma-combine.ll @@ -2337,14 +2337,14 @@ define amdgpu_kernel void @fma_neg_b_c_v4f32(ptr addrspace(1) %out, ptr addrspac ; GFX11-NEXT: v_lshlrev_b32_e32 v12, 4, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: global_load_b128 v[0:3], v12, s[2:3] offset:16 -; GFX11-NEXT: 
global_load_b128 v[4:7], v12, s[2:3] +; GFX11-NEXT: global_load_b128 v[0:3], v12, s[2:3] +; GFX11-NEXT: global_load_b128 v[4:7], v12, s[2:3] offset:16 ; GFX11-NEXT: global_load_b128 v[8:11], v12, s[2:3] offset:48 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_fma_f32 v3, v11, -v7, -v3 -; GFX11-NEXT: v_fma_f32 v2, v10, -v6, -v2 -; GFX11-NEXT: v_fma_f32 v1, v9, -v5, -v1 -; GFX11-NEXT: v_fma_f32 v0, v8, -v4, -v0 +; GFX11-NEXT: v_fma_f32 v3, v11, -v3, -v7 +; GFX11-NEXT: v_fma_f32 v2, v10, -v2, -v6 +; GFX11-NEXT: v_fma_f32 v1, v9, -v1, -v5 +; GFX11-NEXT: v_fma_f32 v0, v8, -v0, -v4 ; GFX11-NEXT: global_store_b128 v12, v[0:3], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll index 0d59021b69019..ef04493b53a70 100644 --- a/llvm/test/CodeGen/AMDGPU/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/frem.ll @@ -106,11 +106,11 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_cvt_f32_f16_e32 v3, v4 +; VI-NEXT: v_cvt_f32_f16_e32 v5, v4 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cvt_f32_f16_e32 v5, v2 -; VI-NEXT: v_rcp_f32_e32 v5, v5 -; VI-NEXT: v_mul_f32_e32 v3, v3, v5 +; VI-NEXT: v_cvt_f32_f16_e32 v3, v2 +; VI-NEXT: v_rcp_f32_e32 v3, v3 +; VI-NEXT: v_mul_f32_e32 v3, v5, v3 ; VI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; VI-NEXT: v_div_fixup_f16 v3, v3, v2, v4 ; VI-NEXT: v_trunc_f16_e32 v3, v3 @@ -1221,9 +1221,9 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] ; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5] ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -1465,19 +1465,20 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] +; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5] +; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; VI-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7] ; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; VI-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7] +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mul_f64 v[8:9], v[2:3], v[6:7] ; VI-NEXT: v_fma_f64 v[10:11], -v[4:5], v[8:9], v[2:3] ; VI-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[8:9] @@ -1690,19 +1691,20 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace( ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] +; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5] +; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: 
flat_load_dwordx2 v[2:3], v[2:3] ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; VI-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7] ; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; VI-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7] +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mul_f64 v[8:9], v[2:3], v[6:7] ; VI-NEXT: v_fma_f64 v[10:11], -v[4:5], v[8:9], v[2:3] ; VI-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[8:9] @@ -1969,22 +1971,22 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; VI-NEXT: v_cvt_f32_f16_e32 v5, v3 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 ; VI-NEXT: v_cvt_f32_f16_e32 v7, v6 -; VI-NEXT: v_rcp_f32_e32 v7, v7 -; VI-NEXT: v_mul_f32_e32 v5, v5, v7 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; VI-NEXT: v_cvt_f32_f16_e32 v5, v3 +; VI-NEXT: v_rcp_f32_e32 v5, v5 +; VI-NEXT: v_mul_f32_e32 v5, v7, v5 ; VI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; VI-NEXT: v_div_fixup_f16 v5, v5, v6, v3 +; VI-NEXT: v_div_fixup_f16 v5, v5, v3, v6 ; VI-NEXT: v_trunc_f16_e32 v5, v5 -; VI-NEXT: v_fma_f16 v3, -v5, v6, v3 -; VI-NEXT: v_cvt_f32_f16_e32 v6, v2 -; VI-NEXT: v_cvt_f32_f16_e32 v5, v4 +; VI-NEXT: v_fma_f16 v3, -v5, v3, v6 +; VI-NEXT: v_cvt_f32_f16_e32 v5, v2 +; VI-NEXT: v_cvt_f32_f16_e32 v6, v4 ; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_rcp_f32_e32 v6, v6 -; VI-NEXT: v_mul_f32_e32 v5, v5, v6 +; VI-NEXT: v_rcp_f32_e32 v5, v5 +; VI-NEXT: v_mul_f32_e32 v5, v6, v5 ; VI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; VI-NEXT: v_div_fixup_f16 v5, v5, v2, v4 ; VI-NEXT: v_trunc_f16_e32 v5, v5 @@ -2359,42 +2361,42 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 -; VI-NEXT: v_cvt_f32_f16_e32 v9, v8 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 ; VI-NEXT: v_cvt_f32_f16_e32 v7, v6 -; VI-NEXT: v_rcp_f32_e32 v9, v9 -; VI-NEXT: v_mul_f32_e32 v7, v7, v9 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; VI-NEXT: v_cvt_f32_f16_e32 v9, v8 +; VI-NEXT: v_rcp_f32_e32 v7, v7 +; VI-NEXT: v_mul_f32_e32 v7, v9, v7 ; VI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; VI-NEXT: v_div_fixup_f16 v7, v7, v8, v6 +; VI-NEXT: v_div_fixup_f16 v7, v7, v6, v8 ; VI-NEXT: v_trunc_f16_e32 v7, v7 -; VI-NEXT: v_fma_f16 v6, -v7, v8, v6 -; VI-NEXT: v_cvt_f32_f16_e32 v8, v5 -; VI-NEXT: v_cvt_f32_f16_e32 v7, v3 +; VI-NEXT: v_fma_f16 v6, -v7, v6, v8 +; VI-NEXT: v_cvt_f32_f16_e32 v7, v5 +; VI-NEXT: v_cvt_f32_f16_e32 v8, v3 ; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; VI-NEXT: v_rcp_f32_e32 v8, v8 -; VI-NEXT: v_mul_f32_e32 v7, v7, v8 +; VI-NEXT: v_rcp_f32_e32 v7, v7 +; VI-NEXT: v_mul_f32_e32 v7, v8, v7 ; VI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; VI-NEXT: v_div_fixup_f16 v7, v7, v5, v3 ; VI-NEXT: v_trunc_f16_e32 v7, v7 ; VI-NEXT: v_fma_f16 v3, -v7, v5, v3 -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; VI-NEXT: v_cvt_f32_f16_e32 v8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; VI-NEXT: v_or_b32_e32 v3, v3, v6 ; VI-NEXT: v_cvt_f32_f16_e32 v6, v5 -; 
VI-NEXT: v_rcp_f32_e32 v8, v8 -; VI-NEXT: v_mul_f32_e32 v6, v6, v8 +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; VI-NEXT: v_cvt_f32_f16_e32 v8, v7 +; VI-NEXT: v_rcp_f32_e32 v6, v6 +; VI-NEXT: v_mul_f32_e32 v6, v8, v6 ; VI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; VI-NEXT: v_div_fixup_f16 v6, v6, v7, v5 +; VI-NEXT: v_div_fixup_f16 v6, v6, v5, v7 ; VI-NEXT: v_trunc_f16_e32 v6, v6 -; VI-NEXT: v_fma_f16 v5, -v6, v7, v5 -; VI-NEXT: v_cvt_f32_f16_e32 v7, v4 -; VI-NEXT: v_cvt_f32_f16_e32 v6, v2 +; VI-NEXT: v_fma_f16 v5, -v6, v5, v7 +; VI-NEXT: v_cvt_f32_f16_e32 v6, v4 +; VI-NEXT: v_cvt_f32_f16_e32 v7, v2 ; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; VI-NEXT: v_rcp_f32_e32 v7, v7 -; VI-NEXT: v_mul_f32_e32 v6, v6, v7 +; VI-NEXT: v_rcp_f32_e32 v6, v6 +; VI-NEXT: v_mul_f32_e32 v6, v7, v6 ; VI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; VI-NEXT: v_div_fixup_f16 v6, v6, v4, v2 ; VI-NEXT: v_trunc_f16_e32 v6, v6 @@ -2560,55 +2562,55 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: global_load_b64 v[0:1], v4, s[6:7] ; GFX1150-NEXT: global_load_b64 v[2:3], v4, s[0:1] offset:32 ; GFX1150-NEXT: s_waitcnt vmcnt(1) -; GFX1150-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GFX1150-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; GFX1150-NEXT: s_waitcnt vmcnt(0) -; GFX1150-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX1150-NEXT: v_lshrrev_b32_e32 v5, 16, v3 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-NEXT: v_cvt_f32_f16_e32 v6, v5 ; GFX1150-NEXT: v_rcp_f32_e32 v6, v6 ; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_fma_mixlo_f16 v6, v0, v6, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX1150-NEXT: v_fma_mixlo_f16 v6, v1, v6, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0] ; GFX1150-NEXT: v_div_fixup_f16 v6, v6, v5, v7 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-NEXT: v_trunc_f16_e32 v6, v6 ; GFX1150-NEXT: v_xor_b32_e32 v6, 0x8000, v6 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1150-NEXT: v_fmac_f16_e32 v7, v6, v5 -; GFX1150-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX1150-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX1150-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX1150-NEXT: v_rcp_f32_e32 v5, v5 ; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_fma_mixlo_f16 v5, v0, v5, 0 op_sel_hi:[1,0,0] -; GFX1150-NEXT: v_div_fixup_f16 v5, v5, v2, v0 +; GFX1150-NEXT: v_fma_mixlo_f16 v5, v1, v5, 0 op_sel_hi:[1,0,0] +; GFX1150-NEXT: v_div_fixup_f16 v5, v5, v3, v1 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-NEXT: v_trunc_f16_e32 v5, v5 ; GFX1150-NEXT: v_xor_b32_e32 v5, 0x8000, v5 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1150-NEXT: v_fma_f16 v0, v5, v2, v0 -; GFX1150-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GFX1150-NEXT: v_pack_b32_f16 v0, v0, v7 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fma_f16 v1, v5, v3, v1 +; GFX1150-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX1150-NEXT: v_pack_b32_f16 v1, v1, v7 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1150-NEXT: v_rcp_f32_e32 v3, v3 +; GFX1150-NEXT: v_fma_mixlo_f16 v3, v0, v3, 0 op_sel_hi:[1,0,0] +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_div_fixup_f16 v3, v3, v2, v0 +; GFX1150-NEXT: v_trunc_f16_e32 v3, v3 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_xor_b32_e32 v3, 0x8000, v3 +; GFX1150-NEXT: v_fma_f16 v3, v3, v2, v0 +; GFX1150-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX1150-NEXT: v_rcp_f32_e32 v5, v5 -; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_fma_mixlo_f16 v5, v1, v5, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0] -; GFX1150-NEXT: v_div_fixup_f16 v5, v5, v2, v6 +; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fma_mixlo_f16 v5, v0, v5, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX1150-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX1150-NEXT: v_div_fixup_f16 v5, v5, v2, v0 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-NEXT: v_trunc_f16_e32 v5, v5 ; GFX1150-NEXT: v_xor_b32_e32 v5, 0x8000, v5 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_fmac_f16_e32 v6, v5, v2 -; GFX1150-NEXT: v_cvt_f32_f16_e32 v2, v3 -; GFX1150-NEXT: v_rcp_f32_e32 v2, v2 -; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_fma_mixlo_f16 v2, v1, v2, 0 op_sel_hi:[1,0,0] -; GFX1150-NEXT: v_div_fixup_f16 v2, v2, v3, v1 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_trunc_f16_e32 v2, v2 -; GFX1150-NEXT: v_xor_b32_e32 v2, 0x8000, v2 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_fmac_f16_e32 v1, v2, v3 -; GFX1150-NEXT: v_pack_b32_f16 v1, v1, v6 +; GFX1150-NEXT: v_fmac_f16_e32 v0, v5, v2 +; GFX1150-NEXT: v_pack_b32_f16 v0, v3, v0 ; GFX1150-NEXT: global_store_b64 v4, v[0:1], s[4:5] ; GFX1150-NEXT: s_nop 0 ; GFX1150-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll index f5dbaaff9cf88..18571e1ea2ffa 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll @@ -68,11 +68,11 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr addrspace(1) %out, ptr ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b32 s0, s6 -; CI-NEXT: s_mov_b32 s1, s7 ; CI-NEXT: v_mov_b32_e32 v0, s8 -; CI-NEXT: v_mov_b32_e32 v1, s9 +; CI-NEXT: s_mov_b32 s1, s7 ; CI-NEXT: s_mov_b32 s6, s2 ; CI-NEXT: s_mov_b32 s7, s3 +; CI-NEXT: v_mov_b32_e32 v1, s9 ; CI-NEXT: buffer_atomic_add_x2 v[0:1], off, s[4:7], 0 offset:32 glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol @@ -87,11 +87,11 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr addrspace(1) %out, ptr ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 ; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: s_mov_b32 s1, s7 ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: buffer_atomic_add_x2 v[0:1], off, s[4:7], 0 offset:32 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -117,9 +117,9 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr addrspace(1) 
%out, ptr ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: global_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -361,8 +361,8 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr addrspace(1) %out, ptr addrspa ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b32 s0, s4 -; CI-NEXT: s_mov_b32 s1, s5 ; CI-NEXT: v_mov_b32_e32 v0, s8 +; CI-NEXT: s_mov_b32 s1, s5 ; CI-NEXT: v_mov_b32_e32 v1, s9 ; CI-NEXT: buffer_atomic_add_x2 v[0:1], off, s[0:3], 0 glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -380,8 +380,8 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr addrspace(1) %out, ptr addrspa ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: buffer_atomic_add_x2 v[0:1], off, s[0:3], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) @@ -410,9 +410,9 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr addrspace(1) %out, ptr addrspa ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: global_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -644,11 +644,11 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr addrspace(1) %out, ptr ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b32 s0, s6 -; CI-NEXT: s_mov_b32 s1, s7 ; CI-NEXT: v_mov_b32_e32 v0, s8 -; CI-NEXT: v_mov_b32_e32 v1, s9 +; CI-NEXT: s_mov_b32 s1, s7 ; CI-NEXT: s_mov_b32 s6, s2 ; CI-NEXT: s_mov_b32 s7, s3 +; CI-NEXT: v_mov_b32_e32 v1, s9 ; CI-NEXT: buffer_atomic_and_x2 v[0:1], off, s[4:7], 0 offset:32 glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol @@ -663,11 +663,11 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr addrspace(1) %out, ptr ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 ; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: s_mov_b32 s1, s7 ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: buffer_atomic_and_x2 v[0:1], off, s[4:7], 0 offset:32 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -693,9 +693,9 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: global_atomic_and_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 
0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -937,8 +937,8 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr addrspace(1) %out, ptr addrspa ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b32 s0, s4 -; CI-NEXT: s_mov_b32 s1, s5 ; CI-NEXT: v_mov_b32_e32 v0, s8 +; CI-NEXT: s_mov_b32 s1, s5 ; CI-NEXT: v_mov_b32_e32 v1, s9 ; CI-NEXT: buffer_atomic_and_x2 v[0:1], off, s[0:3], 0 glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -956,8 +956,8 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr addrspace(1) %out, ptr addrspa ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: buffer_atomic_and_x2 v[0:1], off, s[0:3], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) @@ -986,9 +986,9 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr addrspace(1) %out, ptr addrspa ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: global_atomic_and_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -1220,11 +1220,11 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr addrspace(1) %out, ptr ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b32 s0, s6 -; CI-NEXT: s_mov_b32 s1, s7 ; CI-NEXT: v_mov_b32_e32 v0, s8 -; CI-NEXT: v_mov_b32_e32 v1, s9 +; CI-NEXT: s_mov_b32 s1, s7 ; CI-NEXT: s_mov_b32 s6, s2 ; CI-NEXT: s_mov_b32 s7, s3 +; CI-NEXT: v_mov_b32_e32 v1, s9 ; CI-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[4:7], 0 offset:32 glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol @@ -1239,11 +1239,11 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr addrspace(1) %out, ptr ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 ; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: s_mov_b32 s1, s7 ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[4:7], 0 offset:32 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -1269,9 +1269,9 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: global_atomic_sub_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -1513,8 +1513,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr addrspace(1) %out, ptr addrspa ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b32 s0, s4 -; CI-NEXT: s_mov_b32 s1, s5 ; CI-NEXT: v_mov_b32_e32 v0, s8 +; CI-NEXT: s_mov_b32 s1, s5 ; CI-NEXT: v_mov_b32_e32 v1, s9 ; CI-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[0:3], 0 glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -1532,8 +1532,8 @@ 
define amdgpu_kernel void @atomic_sub_i64_ret(ptr addrspace(1) %out, ptr addrspa ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[0:3], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1562,9 +1562,9 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr addrspace(1) %out, ptr addrspa ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: global_atomic_sub_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -1790,11 +1790,11 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr addrspace(1) %out, ptr ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b32 s0, s6 -; CI-NEXT: s_mov_b32 s1, s7 ; CI-NEXT: v_mov_b32_e32 v0, s8 -; CI-NEXT: v_mov_b32_e32 v1, s9 +; CI-NEXT: s_mov_b32 s1, s7 ; CI-NEXT: s_mov_b32 s6, s2 ; CI-NEXT: s_mov_b32 s7, s3 +; CI-NEXT: v_mov_b32_e32 v1, s9 ; CI-NEXT: buffer_atomic_smax_x2 v[0:1], off, s[4:7], 0 offset:32 glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -1808,11 +1808,11 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr addrspace(1) %out, ptr ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 ; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: s_mov_b32 s1, s7 ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: buffer_atomic_smax_x2 v[0:1], off, s[4:7], 0 offset:32 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -1836,9 +1836,9 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: global_atomic_max_i64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE @@ -2065,8 +2065,8 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr addrspace(1) %out, ptr addrspa ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b32 s0, s4 -; CI-NEXT: s_mov_b32 s1, s5 ; CI-NEXT: v_mov_b32_e32 v0, s8 +; CI-NEXT: s_mov_b32 s1, s5 ; CI-NEXT: v_mov_b32_e32 v1, s9 ; CI-NEXT: buffer_atomic_smax_x2 v[0:1], off, s[0:3], 0 glc ; CI-NEXT: s_mov_b32 s0, s6 @@ -2083,8 +2083,8 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr addrspace(1) %out, ptr addrspa ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: buffer_atomic_smax_x2 v[0:1], off, s[0:3], 0 glc ; VI-NEXT: s_mov_b32 s0, s6 @@ -2111,9 +2111,9 @@ define 
amdgpu_kernel void @atomic_max_i64_ret(ptr addrspace(1) %out, ptr addrspa ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: global_atomic_max_i64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE @@ -2330,11 +2330,11 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr addrspace(1) %out, ptr ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b32 s0, s6 -; CI-NEXT: s_mov_b32 s1, s7 ; CI-NEXT: v_mov_b32_e32 v0, s8 -; CI-NEXT: v_mov_b32_e32 v1, s9 +; CI-NEXT: s_mov_b32 s1, s7 ; CI-NEXT: s_mov_b32 s6, s2 ; CI-NEXT: s_mov_b32 s7, s3 +; CI-NEXT: v_mov_b32_e32 v1, s9 ; CI-NEXT: buffer_atomic_umax_x2 v[0:1], off, s[4:7], 0 offset:32 glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -2348,11 +2348,11 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr addrspace(1) %out, ptr ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 ; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: s_mov_b32 s1, s7 ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: buffer_atomic_umax_x2 v[0:1], off, s[4:7], 0 offset:32 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -2376,9 +2376,9 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: global_atomic_max_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE @@ -2605,8 +2605,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr addrspace(1) %out, ptr addrsp ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b32 s0, s4 -; CI-NEXT: s_mov_b32 s1, s5 ; CI-NEXT: v_mov_b32_e32 v0, s8 +; CI-NEXT: s_mov_b32 s1, s5 ; CI-NEXT: v_mov_b32_e32 v1, s9 ; CI-NEXT: buffer_atomic_umax_x2 v[0:1], off, s[0:3], 0 glc ; CI-NEXT: s_mov_b32 s0, s6 @@ -2623,8 +2623,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr addrspace(1) %out, ptr addrsp ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: buffer_atomic_umax_x2 v[0:1], off, s[0:3], 0 glc ; VI-NEXT: s_mov_b32 s0, s6 @@ -2651,9 +2651,9 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr addrspace(1) %out, ptr addrsp ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 
v1, s5 ; GFX12-NEXT: global_atomic_max_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE @@ -2870,11 +2870,11 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr addrspace(1) %out, ptr ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b32 s0, s6 -; CI-NEXT: s_mov_b32 s1, s7 ; CI-NEXT: v_mov_b32_e32 v0, s8 -; CI-NEXT: v_mov_b32_e32 v1, s9 +; CI-NEXT: s_mov_b32 s1, s7 ; CI-NEXT: s_mov_b32 s6, s2 ; CI-NEXT: s_mov_b32 s7, s3 +; CI-NEXT: v_mov_b32_e32 v1, s9 ; CI-NEXT: buffer_atomic_smin_x2 v[0:1], off, s[4:7], 0 offset:32 glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -2888,11 +2888,11 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr addrspace(1) %out, ptr ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 ; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: s_mov_b32 s1, s7 ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: buffer_atomic_smin_x2 v[0:1], off, s[4:7], 0 offset:32 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -2916,9 +2916,9 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: global_atomic_min_i64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE @@ -3145,8 +3145,8 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr addrspace(1) %out, ptr addrspa ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b32 s0, s4 -; CI-NEXT: s_mov_b32 s1, s5 ; CI-NEXT: v_mov_b32_e32 v0, s8 +; CI-NEXT: s_mov_b32 s1, s5 ; CI-NEXT: v_mov_b32_e32 v1, s9 ; CI-NEXT: buffer_atomic_smin_x2 v[0:1], off, s[0:3], 0 glc ; CI-NEXT: s_mov_b32 s0, s6 @@ -3163,8 +3163,8 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr addrspace(1) %out, ptr addrspa ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: buffer_atomic_smin_x2 v[0:1], off, s[0:3], 0 glc ; VI-NEXT: s_mov_b32 s0, s6 @@ -3191,9 +3191,9 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr addrspace(1) %out, ptr addrspa ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: global_atomic_min_i64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE @@ -3410,11 +3410,11 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr addrspace(1) %out, ptr ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b32 s0, s6 -; CI-NEXT: s_mov_b32 s1, s7 ; CI-NEXT: v_mov_b32_e32 v0, s8 
-; CI-NEXT: v_mov_b32_e32 v1, s9 +; CI-NEXT: s_mov_b32 s1, s7 ; CI-NEXT: s_mov_b32 s6, s2 ; CI-NEXT: s_mov_b32 s7, s3 +; CI-NEXT: v_mov_b32_e32 v1, s9 ; CI-NEXT: buffer_atomic_umin_x2 v[0:1], off, s[4:7], 0 offset:32 glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -3428,11 +3428,11 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr addrspace(1) %out, ptr ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 ; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: s_mov_b32 s1, s7 ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: buffer_atomic_umin_x2 v[0:1], off, s[4:7], 0 offset:32 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -3456,9 +3456,9 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: global_atomic_min_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE @@ -3685,8 +3685,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr addrspace(1) %out, ptr addrsp ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b32 s0, s4 -; CI-NEXT: s_mov_b32 s1, s5 ; CI-NEXT: v_mov_b32_e32 v0, s8 +; CI-NEXT: s_mov_b32 s1, s5 ; CI-NEXT: v_mov_b32_e32 v1, s9 ; CI-NEXT: buffer_atomic_umin_x2 v[0:1], off, s[0:3], 0 glc ; CI-NEXT: s_mov_b32 s0, s6 @@ -3703,8 +3703,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr addrspace(1) %out, ptr addrsp ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: buffer_atomic_umin_x2 v[0:1], off, s[0:3], 0 glc ; VI-NEXT: s_mov_b32 s0, s6 @@ -3731,9 +3731,9 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr addrspace(1) %out, ptr addrsp ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: global_atomic_min_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SE @@ -3956,11 +3956,11 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr addrspace(1) %out, ptr a ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b32 s0, s6 -; CI-NEXT: s_mov_b32 s1, s7 ; CI-NEXT: v_mov_b32_e32 v0, s8 -; CI-NEXT: v_mov_b32_e32 v1, s9 +; CI-NEXT: s_mov_b32 s1, s7 ; CI-NEXT: s_mov_b32 s6, s2 ; CI-NEXT: s_mov_b32 s7, s3 +; CI-NEXT: v_mov_b32_e32 v1, s9 ; CI-NEXT: buffer_atomic_or_x2 v[0:1], off, s[4:7], 0 offset:32 glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol @@ -3975,11 +3975,11 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr addrspace(1) %out, ptr a ; VI-NEXT: s_mov_b32 s2, -1 ; 
VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 ; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: s_mov_b32 s1, s7 ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: buffer_atomic_or_x2 v[0:1], off, s[4:7], 0 offset:32 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -4005,9 +4005,9 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr addrspace(1) %out, ptr a ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: global_atomic_or_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -4249,8 +4249,8 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr addrspace(1) %out, ptr addrspac ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b32 s0, s4 -; CI-NEXT: s_mov_b32 s1, s5 ; CI-NEXT: v_mov_b32_e32 v0, s8 +; CI-NEXT: s_mov_b32 s1, s5 ; CI-NEXT: v_mov_b32_e32 v1, s9 ; CI-NEXT: buffer_atomic_or_x2 v[0:1], off, s[0:3], 0 glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -4268,8 +4268,8 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr addrspace(1) %out, ptr addrspac ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: buffer_atomic_or_x2 v[0:1], off, s[0:3], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) @@ -4298,9 +4298,9 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr addrspace(1) %out, ptr addrspac ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: global_atomic_or_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -4642,11 +4642,11 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr addrspace(1) %out, ptr ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b32 s0, s6 -; CI-NEXT: s_mov_b32 s1, s7 ; CI-NEXT: v_mov_b32_e32 v0, s8 -; CI-NEXT: v_mov_b32_e32 v1, s9 +; CI-NEXT: s_mov_b32 s1, s7 ; CI-NEXT: s_mov_b32 s6, s2 ; CI-NEXT: s_mov_b32 s7, s3 +; CI-NEXT: v_mov_b32_e32 v1, s9 ; CI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[4:7], 0 offset:32 glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol @@ -4661,11 +4661,11 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr addrspace(1) %out, ptr ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 ; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: s_mov_b32 s1, s7 ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[4:7], 0 offset:32 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -4691,9 +4691,9 @@ define amdgpu_kernel void 
@atomic_xchg_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: global_atomic_swap_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -4935,8 +4935,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr addrspace(1) %out, ptr addrsp ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b32 s0, s4 -; CI-NEXT: s_mov_b32 s1, s5 ; CI-NEXT: v_mov_b32_e32 v0, s8 +; CI-NEXT: s_mov_b32 s1, s5 ; CI-NEXT: v_mov_b32_e32 v1, s9 ; CI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[0:3], 0 glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -4954,8 +4954,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr addrspace(1) %out, ptr addrsp ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[0:3], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) @@ -4984,9 +4984,9 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr addrspace(1) %out, ptr addrsp ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: global_atomic_swap_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -5218,11 +5218,11 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr addrspace(1) %out, ptr ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b32 s0, s6 -; CI-NEXT: s_mov_b32 s1, s7 ; CI-NEXT: v_mov_b32_e32 v0, s8 -; CI-NEXT: v_mov_b32_e32 v1, s9 +; CI-NEXT: s_mov_b32 s1, s7 ; CI-NEXT: s_mov_b32 s6, s2 ; CI-NEXT: s_mov_b32 s7, s3 +; CI-NEXT: v_mov_b32_e32 v1, s9 ; CI-NEXT: buffer_atomic_xor_x2 v[0:1], off, s[4:7], 0 offset:32 glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol @@ -5237,11 +5237,11 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr addrspace(1) %out, ptr ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 ; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: s_mov_b32 s1, s7 ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: buffer_atomic_xor_x2 v[0:1], off, s[4:7], 0 offset:32 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -5267,9 +5267,9 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: global_atomic_xor_b64 v[0:1], v2, v[0:1], 
s[0:1] offset:32 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -5511,8 +5511,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr addrspace(1) %out, ptr addrspa ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b32 s0, s4 -; CI-NEXT: s_mov_b32 s1, s5 ; CI-NEXT: v_mov_b32_e32 v0, s8 +; CI-NEXT: s_mov_b32 s1, s5 ; CI-NEXT: v_mov_b32_e32 v1, s9 ; CI-NEXT: buffer_atomic_xor_x2 v[0:1], off, s[0:3], 0 glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -5530,8 +5530,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr addrspace(1) %out, ptr addrspa ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: buffer_atomic_xor_x2 v[0:1], off, s[0:3], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) @@ -5560,9 +5560,9 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr addrspace(1) %out, ptr addrspa ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: global_atomic_xor_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -5881,11 +5881,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr addrspace(1) %out, ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s4 ; CI-NEXT: s_mov_b32 s8, s2 ; CI-NEXT: s_mov_b32 s9, s3 ; CI-NEXT: s_mov_b32 s2, s10 ; CI-NEXT: s_mov_b32 s3, s11 -; CI-NEXT: v_mov_b32_e32 v0, s4 ; CI-NEXT: v_mov_b32_e32 v1, s5 ; CI-NEXT: v_mov_b32_e32 v2, s6 ; CI-NEXT: v_mov_b32_e32 v3, s7 @@ -5901,11 +5901,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr addrspace(1) %out, ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_mov_b32 s8, s2 ; VI-NEXT: s_mov_b32 s9, s3 ; VI-NEXT: s_mov_b32 s2, s10 ; VI-NEXT: s_mov_b32 s3, s11 -; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 @@ -5933,10 +5933,10 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr addrspace(1) %out, ; GFX12-LABEL: atomic_cmpxchg_i64_ret_offset: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7 -; GFX12-NEXT: v_mov_b32_e32 v2, s6 +; GFX12-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] offset:32 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -6205,9 +6205,9 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr addrspace(1) %out, ptr add ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s4 ; CI-NEXT: s_mov_b32 s8, s0 ; CI-NEXT: s_mov_b32 s9, s1 -; CI-NEXT: v_mov_b32_e32 v0, s4 ; CI-NEXT: v_mov_b32_e32 v1, s5 ; CI-NEXT: v_mov_b32_e32 v2, s6 ; CI-NEXT: 
v_mov_b32_e32 v3, s7 @@ -6225,9 +6225,9 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr addrspace(1) %out, ptr add ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_mov_b32 s8, s0 ; VI-NEXT: s_mov_b32 s9, s1 -; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 @@ -6257,10 +6257,10 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr addrspace(1) %out, ptr add ; GFX12-LABEL: atomic_cmpxchg_i64_ret: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7 -; GFX12-NEXT: v_mov_b32_e32 v2, s6 +; GFX12-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -7259,11 +7259,11 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr addrspace(1) %out, ptr ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b32 s0, s6 -; CI-NEXT: s_mov_b32 s1, s7 ; CI-NEXT: v_mov_b32_e32 v0, s8 -; CI-NEXT: v_mov_b32_e32 v1, s9 +; CI-NEXT: s_mov_b32 s1, s7 ; CI-NEXT: s_mov_b32 s6, s2 ; CI-NEXT: s_mov_b32 s7, s3 +; CI-NEXT: v_mov_b32_e32 v1, s9 ; CI-NEXT: buffer_atomic_inc_x2 v[0:1], off, s[4:7], 0 offset:32 glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol @@ -7278,11 +7278,11 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr addrspace(1) %out, ptr ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 ; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: s_mov_b32 s1, s7 ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: buffer_atomic_inc_x2 v[0:1], off, s[4:7], 0 offset:32 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -7308,9 +7308,9 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: global_atomic_inc_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV @@ -7464,11 +7464,11 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr addrspace(1) %out, ptr ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b32 s0, s6 -; CI-NEXT: s_mov_b32 s1, s7 ; CI-NEXT: v_mov_b32_e32 v0, s8 -; CI-NEXT: v_mov_b32_e32 v1, s9 +; CI-NEXT: s_mov_b32 s1, s7 ; CI-NEXT: s_mov_b32 s6, s2 ; CI-NEXT: s_mov_b32 s7, s3 +; CI-NEXT: v_mov_b32_e32 v1, s9 ; CI-NEXT: buffer_atomic_dec_x2 v[0:1], off, s[4:7], 0 offset:32 glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol @@ -7483,11 +7483,11 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr addrspace(1) %out, ptr ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 ; VI-NEXT: 
v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: s_mov_b32 s1, s7 ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: buffer_atomic_dec_x2 v[0:1], off, s[4:7], 0 offset:32 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -7513,9 +7513,9 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr addrspace(1) %out, ptr ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: global_atomic_dec_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll index 7a9f4ae8a20fa..e13a61e3a3295 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll @@ -17,7 +17,7 @@ ; CHECK: .private_segment_fixed_size: 0 ; CHECK: .sgpr_count: 6 ; CHECK: .symbol: test.kd -; CHECK: .vgpr_count: {{3|6}} +; CHECK: .vgpr_count: {{3|5}} ; WAVE64: .wavefront_size: 64 ; WAVE32: .wavefront_size: 32 define amdgpu_kernel void @test( diff --git a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll index 297b5180dfe9b..312ca2d1b0ad5 100644 --- a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll +++ b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll @@ -6,322 +6,110 @@ define void @main(i1 %arg) #0 { ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; CHECK-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[4:5] -; CHECK-NEXT: v_writelane_b32 v8, s30, 0 -; CHECK-NEXT: v_writelane_b32 v8, s31, 1 -; CHECK-NEXT: v_writelane_b32 v8, s36, 2 -; CHECK-NEXT: v_writelane_b32 v8, s37, 3 -; CHECK-NEXT: v_writelane_b32 v8, s38, 4 -; CHECK-NEXT: v_writelane_b32 v8, s39, 5 -; CHECK-NEXT: v_writelane_b32 v8, s40, 6 -; CHECK-NEXT: v_writelane_b32 v8, s41, 7 -; CHECK-NEXT: v_writelane_b32 v8, s42, 8 -; CHECK-NEXT: v_writelane_b32 v8, s43, 9 -; CHECK-NEXT: v_writelane_b32 v8, s44, 10 -; CHECK-NEXT: v_writelane_b32 v8, s45, 11 -; CHECK-NEXT: v_writelane_b32 v8, s46, 12 -; CHECK-NEXT: v_writelane_b32 v8, s47, 13 -; CHECK-NEXT: v_writelane_b32 v8, s48, 14 -; CHECK-NEXT: v_writelane_b32 v8, s49, 15 ; CHECK-NEXT: s_getpc_b64 s[24:25] -; CHECK-NEXT: v_writelane_b32 v8, s50, 16 -; CHECK-NEXT: s_movk_i32 s4, 0xf0 -; CHECK-NEXT: s_mov_b32 s5, s24 -; CHECK-NEXT: v_writelane_b32 v8, s51, 17 -; CHECK-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0 -; CHECK-NEXT: ; implicit-def: $vgpr4 : SGPR spill to VGPR lane -; CHECK-NEXT: s_mov_b64 s[4:5], 0 -; CHECK-NEXT: s_load_dwordx4 s[28:31], s[4:5], 0x0 -; CHECK-NEXT: s_movk_i32 s4, 0x130 -; CHECK-NEXT: s_mov_b32 s5, s24 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_writelane_b32 v4, s36, 0 -; 
CHECK-NEXT: v_writelane_b32 v4, s37, 1 -; CHECK-NEXT: v_writelane_b32 v4, s38, 2 -; CHECK-NEXT: v_writelane_b32 v4, s39, 3 -; CHECK-NEXT: v_writelane_b32 v4, s40, 4 -; CHECK-NEXT: v_writelane_b32 v4, s41, 5 -; CHECK-NEXT: v_writelane_b32 v4, s42, 6 -; CHECK-NEXT: v_writelane_b32 v4, s43, 7 -; CHECK-NEXT: v_writelane_b32 v4, s44, 8 -; CHECK-NEXT: v_writelane_b32 v4, s45, 9 -; CHECK-NEXT: v_writelane_b32 v4, s46, 10 -; CHECK-NEXT: s_load_dwordx16 s[4:19], s[4:5], 0x0 -; CHECK-NEXT: v_writelane_b32 v4, s47, 11 -; CHECK-NEXT: v_writelane_b32 v4, s48, 12 -; CHECK-NEXT: v_writelane_b32 v4, s49, 13 -; CHECK-NEXT: s_mov_b32 s20, 0 +; CHECK-NEXT: s_movk_i32 s12, 0x110 +; CHECK-NEXT: s_mov_b64 s[14:15], 0 +; CHECK-NEXT: s_mov_b32 s13, s24 +; CHECK-NEXT: s_load_dwordx4 s[20:23], s[14:15], 0x0 +; CHECK-NEXT: s_load_dwordx8 s[4:11], s[12:13], 0x0 +; CHECK-NEXT: s_movk_i32 s12, 0x130 +; CHECK-NEXT: s_load_dwordx8 s[12:19], s[12:13], 0x0 +; CHECK-NEXT: s_mov_b32 s28, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 -; CHECK-NEXT: v_writelane_b32 v4, s50, 14 -; CHECK-NEXT: v_mov_b32_e32 v5, s28 -; CHECK-NEXT: v_mov_b32_e32 v6, v1 -; CHECK-NEXT: s_mov_b32 s21, s20 -; CHECK-NEXT: s_mov_b32 s22, s20 -; CHECK-NEXT: s_mov_b32 s23, s20 -; CHECK-NEXT: v_writelane_b32 v4, s51, 15 -; CHECK-NEXT: v_mov_b32_e32 v2, v1 -; CHECK-NEXT: image_sample_lz v5, v[5:6], s[44:51], s[20:23] dmask:0x1 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_writelane_b32 v4, s4, 16 -; CHECK-NEXT: v_writelane_b32 v4, s5, 17 -; CHECK-NEXT: v_writelane_b32 v4, s6, 18 -; CHECK-NEXT: v_writelane_b32 v4, s7, 19 -; CHECK-NEXT: v_writelane_b32 v4, s8, 20 -; CHECK-NEXT: v_writelane_b32 v4, s9, 21 -; CHECK-NEXT: image_sample_lz v6, v[1:2], s[4:11], s[20:23] dmask:0x1 -; CHECK-NEXT: v_writelane_b32 v4, s10, 22 -; CHECK-NEXT: v_writelane_b32 v4, s11, 23 -; CHECK-NEXT: v_writelane_b32 v4, s12, 24 -; CHECK-NEXT: v_writelane_b32 v4, s13, 25 -; CHECK-NEXT: v_writelane_b32 v4, s14, 26 -; CHECK-NEXT: v_writelane_b32 v4, s15, 27 -; CHECK-NEXT: v_writelane_b32 v4, s16, 28 -; CHECK-NEXT: v_writelane_b32 v8, s52, 18 -; CHECK-NEXT: v_writelane_b32 v4, s17, 29 -; CHECK-NEXT: v_writelane_b32 v8, s53, 19 -; CHECK-NEXT: v_writelane_b32 v4, s18, 30 -; CHECK-NEXT: v_writelane_b32 v8, s54, 20 -; CHECK-NEXT: v_writelane_b32 v4, s19, 31 -; CHECK-NEXT: s_mov_b32 s4, 48 -; CHECK-NEXT: s_mov_b32 s5, s24 -; CHECK-NEXT: v_writelane_b32 v8, s55, 21 -; CHECK-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 -; CHECK-NEXT: v_writelane_b32 v8, s56, 22 -; CHECK-NEXT: v_writelane_b32 v8, s57, 23 -; CHECK-NEXT: v_writelane_b32 v8, s58, 24 -; CHECK-NEXT: v_writelane_b32 v8, s59, 25 -; CHECK-NEXT: v_writelane_b32 v8, s60, 26 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_writelane_b32 v4, s4, 32 -; CHECK-NEXT: v_writelane_b32 v8, s61, 27 -; CHECK-NEXT: v_writelane_b32 v4, s5, 33 -; CHECK-NEXT: v_writelane_b32 v8, s62, 28 -; CHECK-NEXT: v_writelane_b32 v4, s6, 34 -; CHECK-NEXT: v_writelane_b32 v8, s63, 29 -; CHECK-NEXT: v_writelane_b32 v4, s7, 35 -; CHECK-NEXT: v_writelane_b32 v8, s64, 30 -; CHECK-NEXT: v_writelane_b32 v4, s8, 36 -; CHECK-NEXT: v_writelane_b32 v8, s65, 31 -; CHECK-NEXT: v_writelane_b32 v4, s9, 37 -; CHECK-NEXT: v_writelane_b32 v8, s66, 32 -; CHECK-NEXT: s_movk_i32 s26, 0x1f0 -; CHECK-NEXT: s_movk_i32 s28, 0x2f0 -; CHECK-NEXT: s_mov_b32 s27, s24 -; CHECK-NEXT: s_mov_b32 s29, s24 -; CHECK-NEXT: v_writelane_b32 v4, s10, 38 -; CHECK-NEXT: v_writelane_b32 v8, s67, 33 -; CHECK-NEXT: v_writelane_b32 v4, s11, 39 -; CHECK-NEXT: s_load_dwordx16 s[52:67], s[26:27], 0x0 -; CHECK-NEXT: 
s_load_dwordx16 s[4:19], s[28:29], 0x0 +; CHECK-NEXT: v_mov_b32_e32 v2, s20 +; CHECK-NEXT: v_mov_b32_e32 v3, v1 +; CHECK-NEXT: s_mov_b32 s20, s28 +; CHECK-NEXT: s_mov_b32 s21, s28 +; CHECK-NEXT: s_mov_b32 s22, s28 +; CHECK-NEXT: s_mov_b32 s23, s28 +; CHECK-NEXT: image_sample_lz v3, v[2:3], s[4:11], s[20:23] dmask:0x1 +; CHECK-NEXT: v_mov_b32_e32 v2, v1 +; CHECK-NEXT: image_sample_lz v4, v[1:2], s[12:19], s[20:23] dmask:0x1 +; CHECK-NEXT: v_writelane_b32 v6, s30, 0 +; CHECK-NEXT: v_writelane_b32 v6, s31, 1 +; CHECK-NEXT: v_writelane_b32 v6, s34, 2 +; CHECK-NEXT: v_writelane_b32 v6, s35, 3 +; CHECK-NEXT: v_writelane_b32 v6, s36, 4 +; CHECK-NEXT: v_writelane_b32 v6, s37, 5 ; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 +; CHECK-NEXT: v_writelane_b32 v6, s38, 6 ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; CHECK-NEXT: s_xor_b64 s[24:25], vcc, -1 -; CHECK-NEXT: ; implicit-def: $vgpr3 : SGPR spill to VGPR lane +; CHECK-NEXT: v_writelane_b32 v6, s39, 7 +; CHECK-NEXT: s_xor_b64 s[30:31], vcc, -1 +; CHECK-NEXT: s_mov_b32 s29, s24 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_mul_f32_e32 v0, v6, v5 -; CHECK-NEXT: s_and_saveexec_b64 s[26:27], s[24:25] -; CHECK-NEXT: s_xor_b64 s[26:27], exec, s[26:27] +; CHECK-NEXT: v_mul_f32_e32 v0, v4, v3 +; CHECK-NEXT: s_and_saveexec_b64 s[12:13], s[30:31] +; CHECK-NEXT: s_xor_b64 s[34:35], exec, s[12:13] ; CHECK-NEXT: s_cbranch_execz .LBB0_3 ; CHECK-NEXT: ; %bb.1: ; %bb48 -; CHECK-NEXT: v_readlane_b32 s36, v4, 0 -; CHECK-NEXT: v_readlane_b32 s44, v4, 8 -; CHECK-NEXT: v_readlane_b32 s45, v4, 9 -; CHECK-NEXT: v_readlane_b32 s46, v4, 10 -; CHECK-NEXT: v_readlane_b32 s47, v4, 11 -; CHECK-NEXT: v_readlane_b32 s48, v4, 12 -; CHECK-NEXT: v_readlane_b32 s49, v4, 13 -; CHECK-NEXT: v_readlane_b32 s50, v4, 14 -; CHECK-NEXT: v_readlane_b32 s51, v4, 15 -; CHECK-NEXT: s_and_b64 vcc, exec, -1 -; CHECK-NEXT: v_readlane_b32 s37, v4, 1 -; CHECK-NEXT: v_readlane_b32 s38, v4, 2 -; CHECK-NEXT: v_readlane_b32 s39, v4, 3 -; CHECK-NEXT: v_readlane_b32 s40, v4, 4 -; CHECK-NEXT: image_sample_lz v5, v[1:2], s[44:51], s[20:23] dmask:0x1 +; CHECK-NEXT: image_sample_lz v3, v[1:2], s[4:11], s[20:23] dmask:0x1 +; CHECK-NEXT: s_load_dwordx4 s[24:27], s[28:29], 0x40 +; CHECK-NEXT: s_load_dwordx8 s[4:11], s[28:29], 0x210 +; CHECK-NEXT: s_load_dwordx8 s[12:19], s[28:29], 0x310 ; CHECK-NEXT: v_mov_b32_e32 v2, 0 -; CHECK-NEXT: v_readlane_b32 s41, v4, 5 -; CHECK-NEXT: v_readlane_b32 s42, v4, 6 -; CHECK-NEXT: v_readlane_b32 s43, v4, 7 +; CHECK-NEXT: s_and_b64 vcc, exec, -1 ; CHECK-NEXT: .LBB0_2: ; %bb50 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_readlane_b32 s36, v4, 32 -; CHECK-NEXT: v_readlane_b32 s40, v4, 36 -; CHECK-NEXT: v_readlane_b32 s41, v4, 37 -; CHECK-NEXT: v_readlane_b32 s42, v4, 38 -; CHECK-NEXT: v_readlane_b32 s43, v4, 39 -; CHECK-NEXT: s_mov_b32 s21, s20 -; CHECK-NEXT: s_mov_b32 s22, s20 -; CHECK-NEXT: s_mov_b32 s23, s20 -; CHECK-NEXT: v_readlane_b32 s37, v4, 33 -; CHECK-NEXT: v_readlane_b32 s38, v4, 34 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: image_sample_lz v6, v[1:2], s[60:67], s[40:43] dmask:0x1 -; CHECK-NEXT: v_readlane_b32 s39, v4, 35 -; CHECK-NEXT: image_sample_lz v1, v[1:2], s[12:19], s[20:23] dmask:0x1 +; CHECK-NEXT: image_sample_lz v4, v[1:2], s[4:11], s[24:27] dmask:0x1 +; CHECK-NEXT: image_sample_lz v5, v[1:2], s[12:19], s[20:23] dmask:0x1 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_sub_f32_e32 v1, v1, v6 +; CHECK-NEXT: v_sub_f32_e32 v1, v5, v4 ; CHECK-NEXT: v_mul_f32_e32 v1, v1, v0 -; CHECK-NEXT: v_mul_f32_e32 v1, v1, v5 +; CHECK-NEXT: 
v_mul_f32_e32 v1, v1, v3 ; CHECK-NEXT: s_mov_b64 vcc, vcc ; CHECK-NEXT: s_cbranch_vccnz .LBB0_2 -; CHECK-NEXT: .LBB0_3: ; %Flow14 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_readlane_b32 s12, v4, 32 -; CHECK-NEXT: v_readlane_b32 s13, v4, 33 -; CHECK-NEXT: v_readlane_b32 s14, v4, 34 -; CHECK-NEXT: v_readlane_b32 s15, v4, 35 -; CHECK-NEXT: v_readlane_b32 s16, v4, 36 -; CHECK-NEXT: v_readlane_b32 s17, v4, 37 -; CHECK-NEXT: v_readlane_b32 s18, v4, 38 -; CHECK-NEXT: v_readlane_b32 s19, v4, 39 -; CHECK-NEXT: v_writelane_b32 v4, s4, 40 -; CHECK-NEXT: v_writelane_b32 v4, s5, 41 -; CHECK-NEXT: v_writelane_b32 v4, s6, 42 -; CHECK-NEXT: v_writelane_b32 v4, s7, 43 -; CHECK-NEXT: v_writelane_b32 v4, s8, 44 -; CHECK-NEXT: v_writelane_b32 v4, s9, 45 -; CHECK-NEXT: v_writelane_b32 v4, s10, 46 -; CHECK-NEXT: v_writelane_b32 v4, s11, 47 -; CHECK-NEXT: v_writelane_b32 v4, s12, 48 -; CHECK-NEXT: v_writelane_b32 v4, s13, 49 -; CHECK-NEXT: v_writelane_b32 v4, s14, 50 -; CHECK-NEXT: v_writelane_b32 v4, s15, 51 -; CHECK-NEXT: v_writelane_b32 v4, s16, 52 -; CHECK-NEXT: v_writelane_b32 v4, s17, 53 -; CHECK-NEXT: v_writelane_b32 v4, s18, 54 -; CHECK-NEXT: v_writelane_b32 v4, s19, 55 -; CHECK-NEXT: v_writelane_b32 v4, s52, 56 -; CHECK-NEXT: v_writelane_b32 v3, s60, 0 -; CHECK-NEXT: v_writelane_b32 v4, s53, 57 -; CHECK-NEXT: v_writelane_b32 v3, s61, 1 -; CHECK-NEXT: v_writelane_b32 v4, s54, 58 -; CHECK-NEXT: v_writelane_b32 v3, s62, 2 -; CHECK-NEXT: v_writelane_b32 v4, s55, 59 -; CHECK-NEXT: v_writelane_b32 v3, s63, 3 -; CHECK-NEXT: v_writelane_b32 v4, s56, 60 -; CHECK-NEXT: v_writelane_b32 v3, s64, 4 -; CHECK-NEXT: v_writelane_b32 v4, s57, 61 -; CHECK-NEXT: v_writelane_b32 v3, s65, 5 -; CHECK-NEXT: v_writelane_b32 v4, s58, 62 -; CHECK-NEXT: v_writelane_b32 v3, s66, 6 -; CHECK-NEXT: v_writelane_b32 v4, s59, 63 -; CHECK-NEXT: v_writelane_b32 v3, s67, 7 -; CHECK-NEXT: s_andn2_saveexec_b64 s[20:21], s[26:27] +; CHECK-NEXT: .LBB0_3: ; %Flow4 +; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[34:35] ; CHECK-NEXT: s_cbranch_execz .LBB0_10 ; CHECK-NEXT: ; %bb.4: ; %bb32 -; CHECK-NEXT: s_and_saveexec_b64 s[8:9], s[24:25] -; CHECK-NEXT: s_xor_b64 s[22:23], exec, s[8:9] +; CHECK-NEXT: s_and_saveexec_b64 s[6:7], s[30:31] +; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[6:7] ; CHECK-NEXT: s_cbranch_execz .LBB0_6 ; CHECK-NEXT: ; %bb.5: ; %bb43 -; CHECK-NEXT: s_mov_b32 s8, 0 -; CHECK-NEXT: s_mov_b32 s9, s8 -; CHECK-NEXT: v_mov_b32_e32 v0, s8 -; CHECK-NEXT: v_readlane_b32 s36, v4, 0 -; CHECK-NEXT: v_mov_b32_e32 v1, s9 -; CHECK-NEXT: s_mov_b32 s10, s8 -; CHECK-NEXT: s_mov_b32 s11, s8 -; CHECK-NEXT: v_readlane_b32 s37, v4, 1 -; CHECK-NEXT: v_readlane_b32 s38, v4, 2 -; CHECK-NEXT: v_readlane_b32 s39, v4, 3 -; CHECK-NEXT: v_readlane_b32 s40, v4, 4 -; CHECK-NEXT: v_readlane_b32 s41, v4, 5 -; CHECK-NEXT: v_readlane_b32 s42, v4, 6 -; CHECK-NEXT: v_readlane_b32 s43, v4, 7 -; CHECK-NEXT: v_readlane_b32 s44, v4, 8 -; CHECK-NEXT: v_readlane_b32 s45, v4, 9 -; CHECK-NEXT: v_readlane_b32 s46, v4, 10 -; CHECK-NEXT: v_readlane_b32 s47, v4, 11 -; CHECK-NEXT: v_readlane_b32 s48, v4, 12 -; CHECK-NEXT: v_readlane_b32 s49, v4, 13 -; CHECK-NEXT: v_readlane_b32 s50, v4, 14 -; CHECK-NEXT: v_readlane_b32 s51, v4, 15 -; CHECK-NEXT: image_sample_lz v5, v[0:1], s[36:43], s[8:11] dmask:0x1 -; CHECK-NEXT: v_readlane_b32 s36, v4, 16 -; CHECK-NEXT: v_readlane_b32 s44, v4, 24 -; CHECK-NEXT: v_readlane_b32 s45, v4, 25 -; CHECK-NEXT: v_readlane_b32 s46, v4, 26 -; CHECK-NEXT: v_readlane_b32 s47, v4, 27 -; CHECK-NEXT: v_readlane_b32 s48, v4, 28 -; CHECK-NEXT: 
v_readlane_b32 s49, v4, 29 -; CHECK-NEXT: v_readlane_b32 s50, v4, 30 -; CHECK-NEXT: v_readlane_b32 s51, v4, 31 -; CHECK-NEXT: v_mov_b32_e32 v6, 0 -; CHECK-NEXT: v_mov_b32_e32 v7, v6 -; CHECK-NEXT: v_readlane_b32 s37, v4, 17 -; CHECK-NEXT: v_readlane_b32 s38, v4, 18 -; CHECK-NEXT: v_readlane_b32 s39, v4, 19 -; CHECK-NEXT: image_sample_lz v0, v[0:1], s[44:51], s[12:15] dmask:0x1 -; CHECK-NEXT: v_readlane_b32 s40, v4, 20 -; CHECK-NEXT: v_readlane_b32 s41, v4, 21 -; CHECK-NEXT: v_readlane_b32 s42, v4, 22 -; CHECK-NEXT: v_readlane_b32 s43, v4, 23 +; CHECK-NEXT: s_load_dwordx4 s[24:27], s[28:29], 0x30 +; CHECK-NEXT: s_load_dwordx8 s[8:15], s[28:29], 0xf0 +; CHECK-NEXT: s_load_dwordx8 s[16:23], s[28:29], 0x150 +; CHECK-NEXT: s_mov_b32 s36, 0 +; CHECK-NEXT: s_mov_b32 s37, s36 +; CHECK-NEXT: v_mov_b32_e32 v2, s36 +; CHECK-NEXT: v_mov_b32_e32 v3, s37 +; CHECK-NEXT: s_mov_b32 s38, s36 +; CHECK-NEXT: s_mov_b32 s39, s36 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: image_sample_lz v1, v[2:3], s[8:15], s[36:39] dmask:0x1 +; CHECK-NEXT: image_sample_lz v0, v[2:3], s[16:23], s[24:27] dmask:0x1 +; CHECK-NEXT: v_mov_b32_e32 v2, 0 +; CHECK-NEXT: v_mov_b32_e32 v3, v2 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_dwordx3 v[5:7], off, s[8:11], 0 +; CHECK-NEXT: buffer_store_dwordx3 v[1:3], off, s[36:39], 0 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 ; CHECK-NEXT: ; implicit-def: $vgpr0 -; CHECK-NEXT: .LBB0_6: ; %Flow12 -; CHECK-NEXT: s_or_saveexec_b64 s[4:5], s[22:23] -; CHECK-NEXT: v_readlane_b32 s52, v4, 40 -; CHECK-NEXT: v_readlane_b32 s53, v4, 41 -; CHECK-NEXT: v_readlane_b32 s54, v4, 42 -; CHECK-NEXT: v_readlane_b32 s55, v4, 43 -; CHECK-NEXT: v_readlane_b32 s56, v4, 44 -; CHECK-NEXT: v_readlane_b32 s57, v4, 45 -; CHECK-NEXT: v_readlane_b32 s58, v4, 46 -; CHECK-NEXT: v_readlane_b32 s59, v4, 47 -; CHECK-NEXT: v_readlane_b32 s60, v4, 48 -; CHECK-NEXT: v_readlane_b32 s61, v4, 49 -; CHECK-NEXT: v_readlane_b32 s62, v4, 50 -; CHECK-NEXT: v_readlane_b32 s63, v4, 51 -; CHECK-NEXT: v_readlane_b32 s64, v4, 52 -; CHECK-NEXT: v_readlane_b32 s65, v4, 53 -; CHECK-NEXT: v_readlane_b32 s66, v4, 54 -; CHECK-NEXT: v_readlane_b32 s67, v4, 55 -; CHECK-NEXT: s_xor_b64 exec, exec, s[4:5] +; CHECK-NEXT: .LBB0_6: ; %Flow2 +; CHECK-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] ; CHECK-NEXT: s_cbranch_execz .LBB0_9 ; CHECK-NEXT: ; %bb.7: ; %bb33.preheader -; CHECK-NEXT: s_mov_b32 s8, 0 -; CHECK-NEXT: s_mov_b32 s6, s8 -; CHECK-NEXT: s_mov_b32 s7, s8 -; CHECK-NEXT: v_mov_b32_e32 v1, s6 -; CHECK-NEXT: v_readlane_b32 s36, v4, 56 -; CHECK-NEXT: s_mov_b32 s9, s8 -; CHECK-NEXT: s_mov_b32 s10, s8 -; CHECK-NEXT: s_mov_b32 s11, s8 -; CHECK-NEXT: v_mov_b32_e32 v2, s7 -; CHECK-NEXT: v_readlane_b32 s37, v4, 57 -; CHECK-NEXT: v_readlane_b32 s38, v4, 58 -; CHECK-NEXT: v_readlane_b32 s39, v4, 59 -; CHECK-NEXT: v_readlane_b32 s40, v4, 60 -; CHECK-NEXT: v_readlane_b32 s41, v4, 61 -; CHECK-NEXT: v_readlane_b32 s42, v4, 62 -; CHECK-NEXT: v_readlane_b32 s43, v4, 63 -; CHECK-NEXT: s_nop 4 -; CHECK-NEXT: image_sample_lz v5, v[1:2], s[36:43], s[8:11] dmask:0x1 -; CHECK-NEXT: image_sample_lz v6, v[1:2], s[52:59], s[8:11] dmask:0x1 -; CHECK-NEXT: ; kill: killed $vgpr1_vgpr2 -; CHECK-NEXT: s_mov_b64 s[12:13], s[36:37] +; CHECK-NEXT: s_load_dwordx8 s[8:15], s[28:29], 0x1f0 +; CHECK-NEXT: s_load_dwordx8 s[16:23], s[28:29], 0x2f0 +; CHECK-NEXT: s_mov_b32 s24, 0 +; CHECK-NEXT: s_mov_b32 s28, s24 +; CHECK-NEXT: s_mov_b32 s29, s24 +; 
CHECK-NEXT: v_mov_b32_e32 v1, s28 +; CHECK-NEXT: s_mov_b32 s25, s24 +; CHECK-NEXT: s_mov_b32 s26, s24 +; CHECK-NEXT: s_mov_b32 s27, s24 +; CHECK-NEXT: v_mov_b32_e32 v2, s29 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: image_sample_lz v3, v[1:2], s[8:15], s[24:27] dmask:0x1 +; CHECK-NEXT: image_sample_lz v4, v[1:2], s[16:23], s[24:27] dmask:0x1 ; CHECK-NEXT: s_and_b64 vcc, exec, 0 -; CHECK-NEXT: v_readlane_b32 s44, v3, 0 -; CHECK-NEXT: v_readlane_b32 s45, v3, 1 -; CHECK-NEXT: v_readlane_b32 s46, v3, 2 -; CHECK-NEXT: v_readlane_b32 s47, v3, 3 -; CHECK-NEXT: v_readlane_b32 s48, v3, 4 -; CHECK-NEXT: v_readlane_b32 s49, v3, 5 -; CHECK-NEXT: v_readlane_b32 s50, v3, 6 -; CHECK-NEXT: v_readlane_b32 s51, v3, 7 -; CHECK-NEXT: s_mov_b64 s[14:15], s[38:39] -; CHECK-NEXT: s_mov_b64 s[16:17], s[40:41] -; CHECK-NEXT: s_mov_b64 s[18:19], s[42:43] -; CHECK-NEXT: ; kill: killed $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 -; CHECK-NEXT: ; kill: killed $sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59 -; CHECK-NEXT: ; kill: killed $sgpr8_sgpr9_sgpr10 killed $sgpr11 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_sub_f32_e32 v1, v6, v5 +; CHECK-NEXT: v_sub_f32_e32 v1, v4, v3 ; CHECK-NEXT: v_mul_f32_e32 v0, v1, v0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: .LBB0_8: ; %bb33 @@ -330,50 +118,20 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: v_sub_f32_e32 v1, v1, v2 ; CHECK-NEXT: s_mov_b64 vcc, vcc ; CHECK-NEXT: s_cbranch_vccz .LBB0_8 -; CHECK-NEXT: .LBB0_9: ; %Flow13 -; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] +; CHECK-NEXT: .LBB0_9: ; %Flow3 +; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] ; CHECK-NEXT: .LBB0_10: ; %UnifiedReturnBlock -; CHECK-NEXT: s_or_b64 exec, exec, s[20:21] -; CHECK-NEXT: v_readlane_b32 s67, v8, 33 -; CHECK-NEXT: v_readlane_b32 s66, v8, 32 -; CHECK-NEXT: v_readlane_b32 s65, v8, 31 -; CHECK-NEXT: v_readlane_b32 s64, v8, 30 -; CHECK-NEXT: v_readlane_b32 s63, v8, 29 -; CHECK-NEXT: v_readlane_b32 s62, v8, 28 -; CHECK-NEXT: v_readlane_b32 s61, v8, 27 -; CHECK-NEXT: v_readlane_b32 s60, v8, 26 -; CHECK-NEXT: v_readlane_b32 s59, v8, 25 -; CHECK-NEXT: v_readlane_b32 s58, v8, 24 -; CHECK-NEXT: v_readlane_b32 s57, v8, 23 -; CHECK-NEXT: v_readlane_b32 s56, v8, 22 -; CHECK-NEXT: v_readlane_b32 s55, v8, 21 -; CHECK-NEXT: v_readlane_b32 s54, v8, 20 -; CHECK-NEXT: v_readlane_b32 s53, v8, 19 -; CHECK-NEXT: v_readlane_b32 s52, v8, 18 -; CHECK-NEXT: v_readlane_b32 s51, v8, 17 -; CHECK-NEXT: v_readlane_b32 s50, v8, 16 -; CHECK-NEXT: v_readlane_b32 s49, v8, 15 -; CHECK-NEXT: v_readlane_b32 s48, v8, 14 -; CHECK-NEXT: v_readlane_b32 s47, v8, 13 -; CHECK-NEXT: v_readlane_b32 s46, v8, 12 -; CHECK-NEXT: v_readlane_b32 s45, v8, 11 -; CHECK-NEXT: v_readlane_b32 s44, v8, 10 -; CHECK-NEXT: v_readlane_b32 s43, v8, 9 -; CHECK-NEXT: v_readlane_b32 s42, v8, 8 -; CHECK-NEXT: v_readlane_b32 s41, v8, 7 -; CHECK-NEXT: v_readlane_b32 s40, v8, 6 -; CHECK-NEXT: v_readlane_b32 s39, v8, 5 -; CHECK-NEXT: v_readlane_b32 s38, v8, 4 -; CHECK-NEXT: v_readlane_b32 s37, v8, 3 -; CHECK-NEXT: v_readlane_b32 s36, v8, 2 -; CHECK-NEXT: v_readlane_b32 s31, v8, 1 -; CHECK-NEXT: v_readlane_b32 s30, v8, 0 -; CHECK-NEXT: ; kill: killed $vgpr4 -; CHECK-NEXT: ; kill: killed $vgpr3 +; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] +; CHECK-NEXT: v_readlane_b32 s39, v6, 7 +; CHECK-NEXT: v_readlane_b32 s38, v6, 6 +; CHECK-NEXT: v_readlane_b32 s37, v6, 5 +; CHECK-NEXT: v_readlane_b32 s36, v6, 4 +; CHECK-NEXT: v_readlane_b32 s35, v6, 3 +; CHECK-NEXT: v_readlane_b32 s34, v6, 2 +; CHECK-NEXT: v_readlane_b32 s31, v6, 1 +; CHECK-NEXT: 
v_readlane_b32 s30, v6, 0 ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; CHECK-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll index fdd913867c8f8..3959aab9880e5 100644 --- a/llvm/test/CodeGen/AMDGPU/idot4s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll @@ -1805,21 +1805,21 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v2, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_bfe_i32 v4, v3, 0, 8 -; GFX8-NEXT: v_bfe_i32 v7, v3, 16, 8 +; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 8 +; GFX8-NEXT: v_bfe_i32 v5, v3, 16, 8 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_bfe_i32 v5, v2, 0, 8 -; GFX8-NEXT: v_mul_i32_i24_sdwa v6, sext(v3), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 -; GFX8-NEXT: v_bfe_i32 v8, v2, 16, 8 -; GFX8-NEXT: v_mad_i32_i24 v4, v4, v5, v6 +; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 8 +; GFX8-NEXT: v_mul_i32_i24_sdwa v4, sext(v3), sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 +; GFX8-NEXT: v_bfe_i32 v6, v0, 16, 8 +; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, v4 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 24, v3 -; GFX8-NEXT: v_ashrrev_i32_e32 v2, 24, v2 -; GFX8-NEXT: v_mad_i32_i24 v4, v7, v8, v4 -; GFX8-NEXT: v_mad_i32_i24 v2, v3, v2, v4 +; GFX8-NEXT: v_ashrrev_i32_e32 v0, 24, v0 +; GFX8-NEXT: v_mad_i32_i24 v1, v5, v6, v1 +; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -3233,7 +3233,6 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1, ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -3242,26 +3241,27 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v2, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v1, 0xff ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; GFX8-NEXT: v_bfe_i32 v7, v7, 0, 8 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3 ; GFX8-NEXT: v_bfe_i32 v5, v5, 0, 8 +; GFX8-NEXT: v_bfe_i32 v2, v2, 0, 8 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v2 -; GFX8-NEXT: v_mul_lo_u16_sdwa v6, sext(v3), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_e32 v8, 0xff, v8 
-; GFX8-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v0 +; GFX8-NEXT: v_mul_lo_u16_sdwa v4, sext(v3), v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX8-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v3 -; GFX8-NEXT: v_mad_u16 v6, v8, v7, v6 +; GFX8-NEXT: v_mad_u16 v4, v6, v5, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 8 -; GFX8-NEXT: v_mad_u16 v4, v4, v5, v6 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v2 -; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4 -; GFX8-NEXT: v_bfe_i32 v2, v2, 0, 16 +; GFX8-NEXT: v_mad_u16 v1, v1, v2, v4 +; GFX8-NEXT: v_mad_u16 v0, v3, v0, v1 +; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll index 9a1de74034cd8..a0bb56923674e 100644 --- a/llvm/test/CodeGen/AMDGPU/idot4u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll @@ -3238,21 +3238,21 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v2, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_and_b32_e32 v4, 0xff, v3 -; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 8 +; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3 +; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 8 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v5, 0xff, v2 -; GFX8-NEXT: v_mul_u32_u24_sdwa v6, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 -; GFX8-NEXT: v_bfe_u32 v8, v2, 16, 8 -; GFX8-NEXT: v_mad_u32_u24 v4, v4, v5, v6 +; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v0 +; GFX8-NEXT: v_mul_u32_u24_sdwa v4, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 +; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 8 +; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, v4 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v2 -; GFX8-NEXT: v_mad_u32_u24 v4, v7, v8, v4 -; GFX8-NEXT: v_mad_u32_u24 v2, v3, v2, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX8-NEXT: v_mad_u32_u24 v1, v5, v6, v1 +; GFX8-NEXT: v_mad_u32_u24 v2, v3, v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -4932,21 +4932,21 @@ define amdgpu_kernel void @idot4_acc32_hilo(ptr addrspace(1) %src1, ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc -; GFX8-NEXT: flat_load_dword v2, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v4 -; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 8 +; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v4 +; GFX8-NEXT: v_bfe_u32 v2, v4, 16, 8 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v4 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v6, 0xff, v2 -; GFX8-NEXT: v_mul_u32_u24_sdwa v7, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 -; GFX8-NEXT: v_bfe_u32 v8, v2, 16, 8 
-; GFX8-NEXT: v_mad_u32_u24 v3, v6, v3, v7 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v4 -; GFX8-NEXT: v_mad_u32_u24 v3, v8, v5, v3 -; GFX8-NEXT: v_mad_u32_u24 v2, v2, v4, v3 +; GFX8-NEXT: v_and_b32_e32 v5, 0xff, v0 +; GFX8-NEXT: v_mul_u32_u24_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 +; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 8 +; GFX8-NEXT: v_mad_u32_u24 v1, v5, v1, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX8-NEXT: v_mad_u32_u24 v1, v6, v2, v1 +; GFX8-NEXT: v_mad_u32_u24 v2, v0, v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -5105,21 +5105,21 @@ define amdgpu_kernel void @idot4_acc32_lohi(ptr addrspace(1) %src1, ; GFX8-NEXT: flat_load_dword v4, v[0:1] ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc -; GFX8-NEXT: flat_load_dword v2, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v4 -; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 8 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v4 +; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v4 +; GFX8-NEXT: v_bfe_u32 v2, v4, 16, 8 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v4 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v2 -; GFX8-NEXT: v_mul_u32_u24_sdwa v4, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_2 -; GFX8-NEXT: v_bfe_u32 v8, v2, 8, 8 -; GFX8-NEXT: v_mad_u32_u24 v3, v3, v7, v4 -; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX8-NEXT: v_mad_u32_u24 v3, v5, v8, v3 -; GFX8-NEXT: v_mad_u32_u24 v2, v6, v2, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX8-NEXT: v_mul_u32_u24_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_2 +; GFX8-NEXT: v_bfe_u32 v6, v0, 8, 8 +; GFX8-NEXT: v_mad_u32_u24 v1, v1, v5, v4 +; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX8-NEXT: v_mad_u32_u24 v1, v2, v6, v1 +; GFX8-NEXT: v_mad_u32_u24 v2, v3, v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -5292,21 +5292,21 @@ define amdgpu_kernel void @idot4_acc32_hihi(ptr addrspace(1) %src1, ; GFX8-NEXT: flat_load_dword v2, v[0:1] ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v4 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc -; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_and_b32_e32 v4, 0xff, v2 -; GFX8-NEXT: v_bfe_u32 v7, v2, 8, 8 +; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GFX8-NEXT: v_bfe_u32 v5, v2, 8, 8 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 8 -; GFX8-NEXT: v_mul_u32_u24_sdwa v6, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 24, v3 -; GFX8-NEXT: v_mad_u32_u24 v4, v4, v5, v6 +; GFX8-NEXT: v_bfe_u32 v3, v0, 16, 8 +; GFX8-NEXT: v_mul_u32_u24_sdwa v4, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v0 +; GFX8-NEXT: v_mad_u32_u24 v1, v1, v3, v4 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v2 -; GFX8-NEXT: v_bfe_u32 v3, v3, 8, 8 -; GFX8-NEXT: v_mad_u32_u24 v4, v7, v8, v4 -; GFX8-NEXT: v_mad_u32_u24 v2, v2, v3, v4 +; GFX8-NEXT: v_bfe_u32 v0, v0, 8, 8 +; GFX8-NEXT: v_mad_u32_u24 v1, 
v5, v6, v1 +; GFX8-NEXT: v_mad_u32_u24 v2, v2, v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -5633,21 +5633,21 @@ define amdgpu_kernel void @idot4_acc32_v16i8(ptr addrspace(1) %src1, ; GFX8-NEXT: v_add_u32_e32 v4, vcc, s6, v0 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[1:2] -; GFX8-NEXT: flat_load_dword v4, v[4:5] -; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_and_b32_e32 v5, 0xff, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v6, 0xff, v4 -; GFX8-NEXT: v_mul_u32_u24_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_1 -; GFX8-NEXT: v_bfe_u32 v7, v3, 8, 8 -; GFX8-NEXT: v_bfe_u32 v8, v4, 16, 8 -; GFX8-NEXT: v_mad_u32_u24 v2, v5, v6, v2 +; GFX8-NEXT: flat_load_dword v0, v[4:5] +; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v4, 0xff, v0 +; GFX8-NEXT: v_mul_u32_u24_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_1 +; GFX8-NEXT: v_bfe_u32 v5, v3, 8, 8 +; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 8 +; GFX8-NEXT: v_mad_u32_u24 v1, v1, v4, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v4 -; GFX8-NEXT: v_mad_u32_u24 v2, v7, v8, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, v3, v4, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX8-NEXT: v_mad_u32_u24 v1, v5, v6, v1 +; GFX8-NEXT: v_mad_u32_u24 v2, v3, v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -5689,9 +5689,9 @@ define amdgpu_kernel void @idot4_acc32_v16i8(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: s_mov_b32 s1, 0x3020001 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DL-NEXT: ; kill: killed $sgpr4_sgpr5_sgpr6 killed $sgpr7 ; GFX9-DL-NEXT: ; kill: killed $vgpr5 ; GFX9-DL-NEXT: ; kill: killed $vgpr4 -; GFX9-DL-NEXT: ; kill: killed $sgpr4_sgpr5_sgpr6 killed $sgpr7 ; GFX9-DL-NEXT: v_perm_b32 v2, v3, v2, s0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_perm_b32 v0, v0, v0, s1 @@ -5705,9 +5705,9 @@ define amdgpu_kernel void @idot4_acc32_v16i8(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v5, 3, v0 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: ; kill: killed $sgpr4_sgpr5_sgpr6 killed $sgpr7 ; GFX10-DL-NEXT: ; kill: killed $vgpr5 ; GFX10-DL-NEXT: ; kill: killed $vgpr4 -; GFX10-DL-NEXT: ; kill: killed $sgpr4_sgpr5_sgpr6 killed $sgpr7 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: global_load_dwordx4 v[0:3], v4, s[4:5] ; GFX10-DL-NEXT: global_load_dword v0, v5, s[6:7] @@ -5822,32 +5822,31 @@ define amdgpu_kernel void @idot4_acc32_v256i8(ptr addrspace(1) %src1, ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX8-NEXT: s_movk_i32 s2, 0xfc ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, s5 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v1 -; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v3 -; GFX8-NEXT: v_addc_u32_e32 v1, 
vcc, 0, v2, vcc -; GFX8-NEXT: flat_load_dword v2, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0xfc, v3 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_dword v1, v[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v4 -; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 8 +; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v0 +; GFX8-NEXT: v_bfe_u32 v3, v0, 16, 8 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v2 -; GFX8-NEXT: v_mul_u32_u24_sdwa v7, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_1 -; GFX8-NEXT: v_and_b32_e32 v8, 0xff, v2 -; GFX8-NEXT: v_mad_u32_u24 v3, v6, v3, v7 -; GFX8-NEXT: v_bfe_u32 v2, v2, 8, 8 -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v4 -; GFX8-NEXT: v_mad_u32_u24 v3, v8, v5, v3 -; GFX8-NEXT: v_mad_u32_u24 v2, v2, v4, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX8-NEXT: v_mul_u32_u24_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_1 +; GFX8-NEXT: v_and_b32_e32 v6, 0xff, v1 +; GFX8-NEXT: v_mad_u32_u24 v0, v5, v2, v0 +; GFX8-NEXT: v_bfe_u32 v1, v1, 8, 8 +; GFX8-NEXT: v_mad_u32_u24 v0, v6, v3, v0 +; GFX8-NEXT: v_mad_u32_u24 v2, v1, v4, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll index f736ca7cd625a..d6a05ad62f8b1 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll @@ -110,13 +110,13 @@ define amdgpu_kernel void @float8_inselt(ptr addrspace(1) %out, <8 x float> %vec ; GCN-LABEL: float8_inselt: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 -; GCN-NEXT: s_load_dword s2, s[0:1], 0x64 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s1, s[0:1], 0x64 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_mov_b32 m0, s2 -; GCN-NEXT: s_add_u32 s2, s0, 16 -; GCN-NEXT: s_addc_u32 s3, s1, 0 +; GCN-NEXT: s_add_u32 s0, s2, 16 +; GCN-NEXT: s_mov_b32 m0, s1 +; GCN-NEXT: s_addc_u32 s1, s3, 0 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: v_mov_b32_e32 v3, s7 @@ -124,13 +124,13 @@ define amdgpu_kernel void @float8_inselt(ptr addrspace(1) %out, <8 x float> %vec ; GCN-NEXT: v_mov_b32_e32 v5, s9 ; GCN-NEXT: v_mov_b32_e32 v6, s10 ; GCN-NEXT: v_mov_b32_e32 v7, s11 -; GCN-NEXT: v_mov_b32_e32 v9, s3 +; GCN-NEXT: v_mov_b32_e32 v9, s1 ; GCN-NEXT: v_movreld_b32_e32 v0, 1.0 -; GCN-NEXT: v_mov_b32_e32 v8, s2 +; GCN-NEXT: v_mov_b32_e32 v8, s0 ; GCN-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v5, s1 -; GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: v_mov_b32_e32 v5, s3 +; GCN-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll index 68427e8937bb9..8b75fe3780cf3 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -654,8 +654,8 @@ define amdgpu_kernel void @dynamic_insertelement_v4f32(ptr addrspace(1) %out, <4 define amdgpu_kernel void @dynamic_insertelement_v8f32(ptr 
addrspace(1) %out, <8 x float> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v8f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8 ; SI-NEXT: s_load_dword s4, s[4:5], 0x10 ; SI-NEXT: v_mov_b32_e32 v8, 0x40a00000 ; SI-NEXT: s_mov_b32 s3, 0x100f000 @@ -677,8 +677,8 @@ define amdgpu_kernel void @dynamic_insertelement_v8f32(ptr addrspace(1) %out, <8 ; ; VI-LABEL: dynamic_insertelement_v8f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 ; VI-NEXT: s_load_dword s4, s[4:5], 0x40 ; VI-NEXT: v_mov_b32_e32 v8, 0x40a00000 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 @@ -1154,8 +1154,8 @@ define amdgpu_kernel void @dynamic_insertelement_v8i32(ptr addrspace(1) %out, <8 ; SI-LABEL: dynamic_insertelement_v8i32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8 +; SI-NEXT: s_load_dword s6, s[4:5], 0x10 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; SI-NEXT: s_load_dword s4, s[4:5], 0x10 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1167,7 +1167,7 @@ define amdgpu_kernel void @dynamic_insertelement_v8i32(ptr addrspace(1) %out, <8 ; SI-NEXT: v_mov_b32_e32 v5, s13 ; SI-NEXT: v_mov_b32_e32 v6, s14 ; SI-NEXT: v_mov_b32_e32 v7, s15 -; SI-NEXT: s_mov_b32 m0, s4 +; SI-NEXT: s_mov_b32 m0, s6 ; SI-NEXT: v_movreld_b32_e32 v0, 5 ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 @@ -1176,8 +1176,8 @@ define amdgpu_kernel void @dynamic_insertelement_v8i32(ptr addrspace(1) %out, <8 ; VI-LABEL: dynamic_insertelement_v8i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 +; VI-NEXT: s_load_dword s6, s[4:5], 0x40 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x40 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1189,7 +1189,7 @@ define amdgpu_kernel void @dynamic_insertelement_v8i32(ptr addrspace(1) %out, <8 ; VI-NEXT: v_mov_b32_e32 v5, s13 ; VI-NEXT: v_mov_b32_e32 v6, s14 ; VI-NEXT: v_mov_b32_e32 v7, s15 -; VI-NEXT: s_mov_b32 m0, s4 +; VI-NEXT: s_mov_b32 m0, s6 ; VI-NEXT: v_movreld_b32_e32 v0, 5 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll index d056a97dc5444..181dd0e4dd66f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll @@ -406,69 +406,69 @@ define amdgpu_kernel void @maxnum_v2f16( ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_max_f16_e64 v0, s8, s8 -; VI-NEXT: v_max_f16_e64 v1, s2, s2 ; VI-NEXT: s_lshr_b32 s0, s8, 16 -; VI-NEXT: v_max_f16_e32 v0, v1, v0 -; VI-NEXT: v_max_f16_e64 v1, s0, s0 -; VI-NEXT: s_lshr_b32 s0, s2, 16 -; VI-NEXT: v_max_f16_e64 v2, s0, s0 -; VI-NEXT: v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: s_lshr_b32 s1, s2, 16 +; VI-NEXT: v_max_f16_e64 v0, s0, s0 +; VI-NEXT: v_max_f16_e64 v1, s1, s1 +; VI-NEXT: v_max_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_max_f16_e64 v1, s2, s2 +; VI-NEXT: v_max_f16_e64 v2, s8, 
s8 +; VI-NEXT: v_max_f16_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: maxnum_v2f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s10, s[8:9], 0x0 -; GFX9-NEXT: s_load_dword s11, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s10, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s11, s[8:9], 0x0 ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_pk_max_f16 v0, s10, s10 ; GFX9-NEXT: v_pk_max_f16 v1, s11, s11 -; GFX9-NEXT: v_pk_max_f16 v0, v1, v0 +; GFX9-NEXT: v_pk_max_f16 v0, v0, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: maxnum_v2f16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX10-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v0, s0, s0 ; GFX10-NEXT: v_pk_max_f16 v1, s1, s1 -; GFX10-NEXT: v_pk_max_f16 v0, v1, v0 +; GFX10-NEXT: v_pk_max_f16 v0, v0, v1 ; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: maxnum_v2f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x0 -; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_load_b32 s2, s[6:7], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v0, s4, s4 -; GFX11-NEXT: v_pk_max_f16 v1, s2, s2 -; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: v_pk_max_f16 v0, s2, s2 +; GFX11-NEXT: v_pk_max_f16 v1, s0, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v0, v1, v0 -; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: v_pk_max_f16 v0, v0, v1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -707,40 +707,40 @@ define amdgpu_kernel void @maxnum_v3f16( ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_max_f16_e64 v0, s8, s8 -; VI-NEXT: v_max_f16_e64 v1, s2, s2 ; VI-NEXT: s_lshr_b32 s0, s8, 16 -; VI-NEXT: v_max_f16_e32 v0, v1, v0 -; VI-NEXT: v_max_f16_e64 v1, s0, s0 -; VI-NEXT: s_lshr_b32 s0, s2, 16 -; VI-NEXT: v_max_f16_e64 v2, s0, s0 -; VI-NEXT: v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_max_f16_e64 v1, s9, s9 -; VI-NEXT: v_max_f16_e64 v2, s3, s3 -; VI-NEXT: v_max_f16_e32 v1, v2, v1 +; VI-NEXT: s_lshr_b32 s1, s2, 16 +; 
VI-NEXT: v_max_f16_e64 v0, s0, s0 +; VI-NEXT: v_max_f16_e64 v1, s1, s1 +; VI-NEXT: v_max_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_max_f16_e64 v1, s2, s2 +; VI-NEXT: v_max_f16_e64 v2, s8, s8 +; VI-NEXT: v_max_f16_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_max_f16_e64 v1, s3, s3 +; VI-NEXT: v_max_f16_e64 v2, s9, s9 +; VI-NEXT: v_max_f16_e32 v1, v1, v2 ; VI-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: maxnum_v3f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[12:13], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[10:11], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[12:13], s[8:9], 0x0 ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_pk_max_f16 v0, s10, s10 ; GFX9-NEXT: v_pk_max_f16 v1, s12, s12 ; GFX9-NEXT: v_pk_max_f16 v2, s11, s11 -; GFX9-NEXT: v_pk_max_f16 v0, v1, v0 +; GFX9-NEXT: v_pk_max_f16 v0, v0, v1 ; GFX9-NEXT: v_pk_max_f16 v1, s13, s13 -; GFX9-NEXT: v_pk_max_f16 v1, v1, v2 +; GFX9-NEXT: v_pk_max_f16 v1, v2, v1 ; GFX9-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -748,11 +748,11 @@ define amdgpu_kernel void @maxnum_v3f16( ; GFX10-LABEL: maxnum_v3f16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x0 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -760,8 +760,8 @@ define amdgpu_kernel void @maxnum_v3f16( ; GFX10-NEXT: v_pk_max_f16 v2, s9, s9 ; GFX10-NEXT: v_pk_max_f16 v0, s0, s0 ; GFX10-NEXT: v_pk_max_f16 v3, s8, s8 -; GFX10-NEXT: v_pk_max_f16 v1, v2, v1 -; GFX10-NEXT: v_pk_max_f16 v0, v3, v0 +; GFX10-NEXT: v_pk_max_f16 v1, v1, v2 +; GFX10-NEXT: v_pk_max_f16 v0, v0, v3 ; GFX10-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 ; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm @@ -769,24 +769,24 @@ define amdgpu_kernel void @maxnum_v3f16( ; GFX11-LABEL: maxnum_v3f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[2:3], s[6:7], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v1, s5, s5 -; GFX11-NEXT: v_pk_max_f16 v2, s3, s3 -; GFX11-NEXT: v_pk_max_f16 v0, s4, s4 -; GFX11-NEXT: v_pk_max_f16 v3, s2, s2 -; GFX11-NEXT: s_mov_b32 s3, 
0x31016000 -; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: v_pk_max_f16 v1, v2, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_pk_max_f16 v0, v3, v0 +; GFX11-NEXT: v_pk_max_f16 v1, s3, s3 +; GFX11-NEXT: v_pk_max_f16 v2, s1, s1 +; GFX11-NEXT: v_pk_max_f16 v0, s2, s2 +; GFX11-NEXT: v_pk_max_f16 v3, s0, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_pk_max_f16 v1, v1, v2 +; GFX11-NEXT: v_pk_max_f16 v0, v0, v3 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: buffer_store_b16 v1, off, s[0:3], 0 offset:4 -; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b16 v1, off, s[4:7], 0 offset:4 +; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -818,20 +818,20 @@ define amdgpu_kernel void @maxnum_v4f16( ; SI-NEXT: s_lshr_b32 s6, s6, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 ; SI-NEXT: s_lshr_b32 s6, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s6, s5, 16 -; SI-NEXT: s_lshr_b32 s4, s4, 16 +; SI-NEXT: s_lshr_b32 s6, s4, 16 +; SI-NEXT: s_lshr_b32 s7, s5, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s7 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 ; SI-NEXT: v_cvt_f32_f16_e32 v6, s5 -; SI-NEXT: v_max_f32_e32 v3, v3, v5 -; SI-NEXT: v_max_f32_e32 v2, v2, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: v_max_f32_e32 v3, v3, v4 +; SI-NEXT: v_max_f32_e32 v2, v2, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_max_f32_e32 v1, v1, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_max_f32_e32 v0, v0, v4 +; SI-NEXT: v_max_f32_e32 v0, v0, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 @@ -853,56 +853,56 @@ define amdgpu_kernel void @maxnum_v4f16( ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_max_f16_e64 v0, s9, s9 -; VI-NEXT: v_max_f16_e64 v1, s3, s3 ; VI-NEXT: s_lshr_b32 s0, s9, 16 -; VI-NEXT: v_max_f16_e32 v0, v1, v0 -; VI-NEXT: v_max_f16_e64 v1, s0, s0 -; VI-NEXT: s_lshr_b32 s0, s3, 16 -; VI-NEXT: v_max_f16_e64 v2, s0, s0 -; VI-NEXT: v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v1, v0, v1 -; VI-NEXT: v_max_f16_e64 v0, s8, s8 -; VI-NEXT: v_max_f16_e64 v2, s2, s2 +; VI-NEXT: s_lshr_b32 s1, s3, 16 +; VI-NEXT: v_max_f16_e64 v0, s0, s0 +; VI-NEXT: v_max_f16_e64 v1, s1, s1 +; VI-NEXT: v_max_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_max_f16_e64 v1, s3, s3 +; VI-NEXT: v_max_f16_e64 v2, s9, s9 +; VI-NEXT: v_max_f16_e32 v1, v1, v2 ; VI-NEXT: s_lshr_b32 s0, s8, 16 -; VI-NEXT: v_max_f16_e32 v0, v2, v0 -; VI-NEXT: v_max_f16_e64 v2, s0, s0 +; VI-NEXT: v_or_b32_e32 v1, v1, v0 +; VI-NEXT: v_max_f16_e64 v0, s0, s0 ; VI-NEXT: s_lshr_b32 s0, s2, 16 -; VI-NEXT: v_max_f16_e64 v3, s0, s0 -; VI-NEXT: v_max_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v0, v0, v2 +; VI-NEXT: v_max_f16_e64 v2, s0, s0 +; VI-NEXT: v_max_f16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_max_f16_e64 v2, s2, s2 +; VI-NEXT: v_max_f16_e64 v3, s8, s8 +; VI-NEXT: v_max_f16_e32 v2, v2, v3 +; VI-NEXT: v_or_b32_e32 v0, v2, v0 ; VI-NEXT: 
buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: maxnum_v4f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[12:13], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[10:11], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[12:13], s[8:9], 0x0 ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_pk_max_f16 v0, s11, s11 ; GFX9-NEXT: v_pk_max_f16 v1, s13, s13 ; GFX9-NEXT: v_pk_max_f16 v2, s10, s10 -; GFX9-NEXT: v_pk_max_f16 v1, v1, v0 +; GFX9-NEXT: v_pk_max_f16 v1, v0, v1 ; GFX9-NEXT: v_pk_max_f16 v0, s12, s12 -; GFX9-NEXT: v_pk_max_f16 v0, v0, v2 +; GFX9-NEXT: v_pk_max_f16 v0, v2, v0 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: maxnum_v4f16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x0 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -910,30 +910,30 @@ define amdgpu_kernel void @maxnum_v4f16( ; GFX10-NEXT: v_pk_max_f16 v1, s9, s9 ; GFX10-NEXT: v_pk_max_f16 v2, s0, s0 ; GFX10-NEXT: v_pk_max_f16 v3, s8, s8 -; GFX10-NEXT: v_pk_max_f16 v1, v1, v0 -; GFX10-NEXT: v_pk_max_f16 v0, v3, v2 +; GFX10-NEXT: v_pk_max_f16 v1, v0, v1 +; GFX10-NEXT: v_pk_max_f16 v0, v2, v3 ; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: maxnum_v4f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[2:3], s[6:7], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v0, s5, s5 -; GFX11-NEXT: v_pk_max_f16 v1, s3, s3 -; GFX11-NEXT: v_pk_max_f16 v2, s4, s4 -; GFX11-NEXT: v_pk_max_f16 v3, s2, s2 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: v_pk_max_f16 v1, v1, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_pk_max_f16 v0, v3, v2 -; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX11-NEXT: v_pk_max_f16 v0, s3, s3 +; GFX11-NEXT: v_pk_max_f16 v1, s1, s1 +; GFX11-NEXT: v_pk_max_f16 v2, s2, s2 +; GFX11-NEXT: v_pk_max_f16 v3, s0, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_pk_max_f16 v1, v0, v1 +; GFX11-NEXT: v_pk_max_f16 v0, v2, v3 +; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm diff --git 
a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll index f934a2de9247f..c5251468decc9 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll @@ -431,68 +431,68 @@ define amdgpu_kernel void @minnum_v2f16_ieee( ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_max_f16_e64 v0, s8, s8 -; VI-NEXT: v_max_f16_e64 v1, s2, s2 ; VI-NEXT: s_lshr_b32 s0, s8, 16 -; VI-NEXT: v_min_f16_e32 v0, v1, v0 -; VI-NEXT: v_max_f16_e64 v1, s0, s0 -; VI-NEXT: s_lshr_b32 s0, s2, 16 -; VI-NEXT: v_max_f16_e64 v2, s0, s0 -; VI-NEXT: v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: s_lshr_b32 s1, s2, 16 +; VI-NEXT: v_max_f16_e64 v0, s0, s0 +; VI-NEXT: v_max_f16_e64 v1, s1, s1 +; VI-NEXT: v_min_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_max_f16_e64 v1, s2, s2 +; VI-NEXT: v_max_f16_e64 v2, s8, s8 +; VI-NEXT: v_min_f16_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: minnum_v2f16_ieee: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s10, s[8:9], 0x0 -; GFX9-NEXT: s_load_dword s11, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s10, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s11, s[8:9], 0x0 ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_pk_max_f16 v0, s10, s10 ; GFX9-NEXT: v_pk_max_f16 v1, s11, s11 -; GFX9-NEXT: v_pk_min_f16 v0, v1, v0 +; GFX9-NEXT: v_pk_min_f16 v0, v0, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: minnum_v2f16_ieee: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX10-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v0, s0, s0 ; GFX10-NEXT: v_pk_max_f16 v1, s1, s1 -; GFX10-NEXT: v_pk_min_f16 v0, v1, v0 +; GFX10-NEXT: v_pk_min_f16 v0, v0, v1 ; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: minnum_v2f16_ieee: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x0 -; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_load_b32 s2, s[6:7], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v0, s4, s4 -; GFX11-NEXT: v_pk_max_f16 v1, s2, s2 -; GFX11-NEXT: s_mov_b32 s2, -1 -; 
GFX11-NEXT: v_pk_min_f16 v0, v1, v0 -; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: v_pk_max_f16 v0, s2, s2 +; GFX11-NEXT: v_pk_max_f16 v1, s0, s0 +; GFX11-NEXT: v_pk_min_f16 v0, v0, v1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -764,40 +764,40 @@ define amdgpu_kernel void @minnum_v3f16( ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_max_f16_e64 v0, s8, s8 -; VI-NEXT: v_max_f16_e64 v1, s2, s2 ; VI-NEXT: s_lshr_b32 s0, s8, 16 -; VI-NEXT: v_min_f16_e32 v0, v1, v0 -; VI-NEXT: v_max_f16_e64 v1, s0, s0 -; VI-NEXT: s_lshr_b32 s0, s2, 16 -; VI-NEXT: v_max_f16_e64 v2, s0, s0 -; VI-NEXT: v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_max_f16_e64 v1, s9, s9 -; VI-NEXT: v_max_f16_e64 v2, s3, s3 -; VI-NEXT: v_min_f16_e32 v1, v2, v1 +; VI-NEXT: s_lshr_b32 s1, s2, 16 +; VI-NEXT: v_max_f16_e64 v0, s0, s0 +; VI-NEXT: v_max_f16_e64 v1, s1, s1 +; VI-NEXT: v_min_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_max_f16_e64 v1, s2, s2 +; VI-NEXT: v_max_f16_e64 v2, s8, s8 +; VI-NEXT: v_min_f16_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_max_f16_e64 v1, s3, s3 +; VI-NEXT: v_max_f16_e64 v2, s9, s9 +; VI-NEXT: v_min_f16_e32 v1, v1, v2 ; VI-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: minnum_v3f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[12:13], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[10:11], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[12:13], s[8:9], 0x0 ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_pk_max_f16 v0, s10, s10 ; GFX9-NEXT: v_pk_max_f16 v1, s12, s12 ; GFX9-NEXT: v_pk_max_f16 v2, s11, s11 -; GFX9-NEXT: v_pk_min_f16 v0, v1, v0 +; GFX9-NEXT: v_pk_min_f16 v0, v0, v1 ; GFX9-NEXT: v_pk_max_f16 v1, s13, s13 -; GFX9-NEXT: v_pk_min_f16 v1, v1, v2 +; GFX9-NEXT: v_pk_min_f16 v1, v2, v1 ; GFX9-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -805,11 +805,11 @@ define amdgpu_kernel void @minnum_v3f16( ; GFX10-LABEL: minnum_v3f16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x0 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -817,8 +817,8 @@ define amdgpu_kernel void @minnum_v3f16( ; GFX10-NEXT: v_pk_max_f16 v2, s9, s9 ; GFX10-NEXT: v_pk_max_f16 v0, s0, s0 ; GFX10-NEXT: v_pk_max_f16 v3, s8, s8 -; GFX10-NEXT: v_pk_min_f16 v1, v2, v1 -; GFX10-NEXT: v_pk_min_f16 v0, v3, v0 +; GFX10-NEXT: 
v_pk_min_f16 v1, v1, v2 +; GFX10-NEXT: v_pk_min_f16 v0, v0, v3 ; GFX10-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 ; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm @@ -826,23 +826,23 @@ define amdgpu_kernel void @minnum_v3f16( ; GFX11-LABEL: minnum_v3f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[2:3], s[6:7], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v1, s5, s5 -; GFX11-NEXT: v_pk_max_f16 v2, s3, s3 -; GFX11-NEXT: v_pk_max_f16 v0, s4, s4 -; GFX11-NEXT: v_pk_max_f16 v3, s2, s2 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: v_pk_min_f16 v1, v2, v1 -; GFX11-NEXT: v_pk_min_f16 v0, v3, v0 +; GFX11-NEXT: v_pk_max_f16 v1, s3, s3 +; GFX11-NEXT: v_pk_max_f16 v2, s1, s1 +; GFX11-NEXT: v_pk_max_f16 v0, s2, s2 +; GFX11-NEXT: v_pk_max_f16 v3, s0, s0 +; GFX11-NEXT: v_pk_min_f16 v1, v1, v2 +; GFX11-NEXT: v_pk_min_f16 v0, v0, v3 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: buffer_store_b16 v1, off, s[0:3], 0 offset:4 -; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b16 v1, off, s[4:7], 0 offset:4 +; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -874,20 +874,20 @@ define amdgpu_kernel void @minnum_v4f16( ; SI-NEXT: s_lshr_b32 s6, s6, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 ; SI-NEXT: s_lshr_b32 s6, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s6, s5, 16 -; SI-NEXT: s_lshr_b32 s4, s4, 16 +; SI-NEXT: s_lshr_b32 s6, s4, 16 +; SI-NEXT: s_lshr_b32 s7, s5, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s7 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 ; SI-NEXT: v_cvt_f32_f16_e32 v6, s5 -; SI-NEXT: v_min_f32_e32 v3, v3, v5 -; SI-NEXT: v_min_f32_e32 v2, v2, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: v_min_f32_e32 v3, v3, v4 +; SI-NEXT: v_min_f32_e32 v2, v2, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_min_f32_e32 v1, v1, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_min_f32_e32 v0, v0, v4 +; SI-NEXT: v_min_f32_e32 v0, v0, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 @@ -909,56 +909,56 @@ define amdgpu_kernel void @minnum_v4f16( ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_max_f16_e64 v0, s9, s9 -; VI-NEXT: v_max_f16_e64 v1, s3, s3 ; VI-NEXT: s_lshr_b32 s0, s9, 16 -; VI-NEXT: v_min_f16_e32 v0, v1, v0 -; VI-NEXT: v_max_f16_e64 v1, s0, s0 -; VI-NEXT: s_lshr_b32 s0, s3, 16 -; VI-NEXT: v_max_f16_e64 v2, s0, s0 -; VI-NEXT: v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v1, v0, v1 -; VI-NEXT: v_max_f16_e64 v0, s8, s8 -; VI-NEXT: v_max_f16_e64 v2, s2, s2 +; VI-NEXT: s_lshr_b32 s1, s3, 16 +; VI-NEXT: v_max_f16_e64 v0, s0, s0 +; VI-NEXT: v_max_f16_e64 v1, s1, s1 +; 
VI-NEXT: v_min_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_max_f16_e64 v1, s3, s3 +; VI-NEXT: v_max_f16_e64 v2, s9, s9 +; VI-NEXT: v_min_f16_e32 v1, v1, v2 ; VI-NEXT: s_lshr_b32 s0, s8, 16 -; VI-NEXT: v_min_f16_e32 v0, v2, v0 -; VI-NEXT: v_max_f16_e64 v2, s0, s0 +; VI-NEXT: v_or_b32_e32 v1, v1, v0 +; VI-NEXT: v_max_f16_e64 v0, s0, s0 ; VI-NEXT: s_lshr_b32 s0, s2, 16 -; VI-NEXT: v_max_f16_e64 v3, s0, s0 -; VI-NEXT: v_min_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v0, v0, v2 +; VI-NEXT: v_max_f16_e64 v2, s0, s0 +; VI-NEXT: v_min_f16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_max_f16_e64 v2, s2, s2 +; VI-NEXT: v_max_f16_e64 v3, s8, s8 +; VI-NEXT: v_min_f16_e32 v2, v2, v3 +; VI-NEXT: v_or_b32_e32 v0, v2, v0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: minnum_v4f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[12:13], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[10:11], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[12:13], s[8:9], 0x0 ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_pk_max_f16 v0, s11, s11 ; GFX9-NEXT: v_pk_max_f16 v1, s13, s13 ; GFX9-NEXT: v_pk_max_f16 v2, s10, s10 -; GFX9-NEXT: v_pk_min_f16 v1, v1, v0 +; GFX9-NEXT: v_pk_min_f16 v1, v0, v1 ; GFX9-NEXT: v_pk_max_f16 v0, s12, s12 -; GFX9-NEXT: v_pk_min_f16 v0, v0, v2 +; GFX9-NEXT: v_pk_min_f16 v0, v2, v0 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: minnum_v4f16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x0 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -966,29 +966,29 @@ define amdgpu_kernel void @minnum_v4f16( ; GFX10-NEXT: v_pk_max_f16 v1, s9, s9 ; GFX10-NEXT: v_pk_max_f16 v2, s0, s0 ; GFX10-NEXT: v_pk_max_f16 v3, s8, s8 -; GFX10-NEXT: v_pk_min_f16 v1, v1, v0 -; GFX10-NEXT: v_pk_min_f16 v0, v3, v2 +; GFX10-NEXT: v_pk_min_f16 v1, v0, v1 +; GFX10-NEXT: v_pk_min_f16 v0, v2, v3 ; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: minnum_v4f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[2:3], s[6:7], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX11-NEXT: v_pk_max_f16 v0, s5, s5 -; GFX11-NEXT: v_pk_max_f16 v1, s3, s3 -; GFX11-NEXT: v_pk_max_f16 v2, s4, s4 -; GFX11-NEXT: v_pk_max_f16 v3, s2, s2 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: v_pk_min_f16 v1, v1, v0 -; GFX11-NEXT: v_pk_min_f16 v0, v3, v2 -; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX11-NEXT: v_pk_max_f16 v0, s3, s3 +; GFX11-NEXT: v_pk_max_f16 v1, s1, s1 +; GFX11-NEXT: v_pk_max_f16 v2, s2, s2 +; GFX11-NEXT: v_pk_max_f16 v3, s0, s0 +; GFX11-NEXT: v_pk_min_f16 v1, v0, v1 +; GFX11-NEXT: v_pk_min_f16 v0, v2, v3 +; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll index cfaefca3a516d..4eb7fe1c72325 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll @@ -68,93 +68,100 @@ attributes #0 = { nounwind } define amdgpu_kernel void @constant_load_2v4f64(ptr addrspace(4) noalias nocapture readonly %weights, ptr addrspace(1) noalias nocapture %out_ptr) { ; GFX6-NOHSA-LABEL: constant_load_2v4f64: ; GFX6-NOHSA: ; %bb.0: ; %entry -; GFX6-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x9 +; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NOHSA-NEXT: s_load_dwordx2 s[24:25], s[18:19], 0x0 -; GFX6-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 -; GFX6-NOHSA-NEXT: s_mov_b32 s23, 0xf000 -; GFX6-NOHSA-NEXT: s_mov_b32 s22, -1 -; GFX6-NOHSA-NEXT: s_mov_b32 s20, s18 -; GFX6-NOHSA-NEXT: s_mov_b32 s21, s19 +; GFX6-NOHSA-NEXT: s_load_dwordx2 s[20:21], s[2:3], 0x0 +; GFX6-NOHSA-NEXT: s_load_dwordx8 s[12:19], s[0:1], 0x0 +; GFX6-NOHSA-NEXT: s_mov_b32 s11, 0xf000 +; GFX6-NOHSA-NEXT: s_mov_b32 s10, -1 +; GFX6-NOHSA-NEXT: s_mov_b32 s8, s2 +; GFX6-NOHSA-NEXT: s_mov_b32 s9, s3 +; GFX6-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x8 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s24 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s25 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s20 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s21 +; GFX6-NOHSA-NEXT: v_add_f64 v[0:1], s[12:13], v[0:1] +; GFX6-NOHSA-NEXT: v_add_f64 v[0:1], s[14:15], v[0:1] +; GFX6-NOHSA-NEXT: v_add_f64 v[0:1], s[16:17], v[0:1] +; GFX6-NOHSA-NEXT: v_add_f64 v[0:1], s[18:19], v[0:1] ; GFX6-NOHSA-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] ; GFX6-NOHSA-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] ; GFX6-NOHSA-NEXT: v_add_f64 v[0:1], s[4:5], v[0:1] ; GFX6-NOHSA-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1] -; GFX6-NOHSA-NEXT: v_add_f64 v[0:1], s[8:9], v[0:1] -; GFX6-NOHSA-NEXT: v_add_f64 v[0:1], s[10:11], v[0:1] -; GFX6-NOHSA-NEXT: v_add_f64 v[0:1], s[12:13], v[0:1] -; GFX6-NOHSA-NEXT: v_add_f64 v[0:1], s[14:15], v[0:1] -; GFX6-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[20:23], 0 +; GFX6-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; GFX6-NOHSA-NEXT: s_endpgm ; ; GFX7-HSA-LABEL: constant_load_2v4f64: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-HSA-NEXT: s_load_dwordx2 s[20:21], s[18:19], 0x0 -; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s19 +; GFX7-HSA-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x0 
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s12 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s13 +; GFX7-HSA-NEXT: v_add_f64 v[0:1], s[4:5], v[0:1] +; GFX7-HSA-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1] +; GFX7-HSA-NEXT: v_add_f64 v[0:1], s[8:9], v[0:1] +; GFX7-HSA-NEXT: v_add_f64 v[0:1], s[10:11], v[0:1] +; GFX7-HSA-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s20 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s21 -; GFX7-HSA-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] -; GFX7-HSA-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] ; GFX7-HSA-NEXT: v_add_f64 v[0:1], s[4:5], v[0:1] ; GFX7-HSA-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1] ; GFX7-HSA-NEXT: v_add_f64 v[0:1], s[8:9], v[0:1] ; GFX7-HSA-NEXT: v_add_f64 v[0:1], s[10:11], v[0:1] -; GFX7-HSA-NEXT: v_add_f64 v[0:1], s[12:13], v[0:1] -; GFX7-HSA-NEXT: v_add_f64 v[0:1], s[14:15], v[0:1] ; GFX7-HSA-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-HSA-NEXT: s_endpgm ; ; GFX8-NOHSA-LABEL: constant_load_2v4f64: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NOHSA-NEXT: s_load_dwordx2 s[20:21], s[18:19], 0x0 -; GFX8-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s18 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s19 +; GFX8-NOHSA-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x0 +; GFX8-NOHSA-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s12 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s13 +; GFX8-NOHSA-NEXT: v_add_f64 v[0:1], s[4:5], v[0:1] +; GFX8-NOHSA-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1] +; GFX8-NOHSA-NEXT: v_add_f64 v[0:1], s[8:9], v[0:1] +; GFX8-NOHSA-NEXT: v_add_f64 v[0:1], s[10:11], v[0:1] +; GFX8-NOHSA-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x20 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s20 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s21 -; GFX8-NOHSA-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] -; GFX8-NOHSA-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] ; GFX8-NOHSA-NEXT: v_add_f64 v[0:1], s[4:5], v[0:1] ; GFX8-NOHSA-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1] ; GFX8-NOHSA-NEXT: v_add_f64 v[0:1], s[8:9], v[0:1] ; GFX8-NOHSA-NEXT: v_add_f64 v[0:1], s[10:11], v[0:1] -; GFX8-NOHSA-NEXT: v_add_f64 v[0:1], s[12:13], v[0:1] -; GFX8-NOHSA-NEXT: v_add_f64 v[0:1], s[14:15], v[0:1] ; GFX8-NOHSA-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NOHSA-NEXT: s_endpgm ; ; GFX12-LABEL: constant_load_2v4f64: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[16:19], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_load_b64 s[20:21], s[18:19], 0x0 -; GFX12-NEXT: s_load_b512 s[0:15], s[16:17], 0x0 +; GFX12-NEXT: s_load_b64 s[12:13], s[2:3], 0x0 +; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_f64_e64 v[0:1], s[0:1], s[20:21] +; GFX12-NEXT: v_add_f64_e64 v[0:1], s[4:5], s[12:13] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f64_e32 v[0:1], s[2:3], v[0:1] +; GFX12-NEXT: v_add_f64_e32 v[0:1], s[6:7], v[0:1] +; GFX12-NEXT: v_add_f64_e32 v[0:1], s[8:9], v[0:1] +; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_f64_e32 v[0:1], s[10:11], v[0:1] +; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x20 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_add_f64_e32 v[0:1], s[4:5], v[0:1] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add_f64_e32 v[0:1], s[6:7], v[0:1] ; GFX12-NEXT: v_add_f64_e32 v[0:1], s[8:9], v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f64_e32 v[0:1], s[10:11], v[0:1] -; GFX12-NEXT: v_add_f64_e32 v[0:1], s[12:13], v[0:1] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_add_f64_e32 v[0:1], s[14:15], v[0:1] -; GFX12-NEXT: global_store_b64 v2, v[0:1], s[18:19] +; GFX12-NEXT: v_add_f64_e32 v[0:1], s[10:11], v[0:1] +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll index 6672568b98a20..ada333bb535c6 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll +++ b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll @@ -43,7 +43,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: s_addc_u32 s11, s11, 0 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11 -; CHECK-NEXT: s_load_dwordx8 s[44:51], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx4 s[44:47], s[6:7], 0x10 ; CHECK-NEXT: s_add_u32 s0, s0, s15 ; CHECK-NEXT: s_mov_b64 s[34:35], s[6:7] ; CHECK-NEXT: s_addc_u32 s1, s1, 0 @@ -101,7 +101,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] ; CHECK-NEXT: s_mov_b32 s12, s41 ; CHECK-NEXT: s_mov_b32 s13, s40 -; CHECK-NEXT: global_load_dword v0, v0, s[48:49] +; CHECK-NEXT: global_load_dword v0, v0, s[44:45] ; CHECK-NEXT: s_mov_b32 s14, s33 ; CHECK-NEXT: s_getpc_b64 s[6:7] ; CHECK-NEXT: s_add_u32 s6, s6, _Z3minjj@rel32@lo+4 @@ -138,44 +138,44 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: v_lshlrev_b32_e32 v43, 10, v43 ; CHECK-NEXT: v_add_nc_u32_e32 v46, 0x3c05, v0 ; CHECK-NEXT: v_mov_b32_e32 v47, 0 -; CHECK-NEXT: s_mov_b32 s49, 0 +; CHECK-NEXT: s_mov_b32 s45, 0 ; CHECK-NEXT: .LBB0_5: ; =>This Loop Header: Depth=1 ; CHECK-NEXT: ; Child Loop BB0_8 Depth 2 ; CHECK-NEXT: ; Child Loop BB0_20 Depth 2 -; CHECK-NEXT: v_add_nc_u32_e32 v0, s49, v44 -; CHECK-NEXT: s_lshl_b32 s4, s49, 5 -; CHECK-NEXT: s_add_i32 s48, s49, 1 -; CHECK-NEXT: s_add_i32 s5, s49, 5 -; CHECK-NEXT: v_or3_b32 v57, s4, v43, s48 +; CHECK-NEXT: v_add_nc_u32_e32 v0, s45, v44 +; CHECK-NEXT: s_lshl_b32 s4, s45, 5 +; CHECK-NEXT: s_add_i32 s44, s45, 1 +; CHECK-NEXT: s_add_i32 s5, s45, 5 +; CHECK-NEXT: v_or3_b32 v57, s4, v43, s44 ; CHECK-NEXT: ds_read_u8 v0, v0 -; CHECK-NEXT: v_mov_b32_e32 v58, s48 -; CHECK-NEXT: s_mov_b32 s52, exec_lo +; CHECK-NEXT: v_mov_b32_e32 v58, s44 +; CHECK-NEXT: s_mov_b32 s48, exec_lo ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_and_b32_e32 v56, 0xff, v0 ; CHECK-NEXT: v_cmpx_lt_u32_e64 s5, v42 ; CHECK-NEXT: s_cbranch_execz .LBB0_17 ; CHECK-NEXT: ; %bb.6: ; %.preheader2 ; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1 -; CHECK-NEXT: s_mov_b32 s53, 0 -; CHECK-NEXT: s_mov_b32 s54, 0 
+; CHECK-NEXT: s_mov_b32 s49, 0 +; CHECK-NEXT: s_mov_b32 s50, 0 ; CHECK-NEXT: s_branch .LBB0_8 ; CHECK-NEXT: .LBB0_7: ; in Loop: Header=BB0_8 Depth=2 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s55 -; CHECK-NEXT: s_add_i32 s54, s54, 4 -; CHECK-NEXT: s_add_i32 s4, s49, s54 -; CHECK-NEXT: v_add_nc_u32_e32 v0, s54, v57 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s51 +; CHECK-NEXT: s_add_i32 s50, s50, 4 +; CHECK-NEXT: s_add_i32 s4, s45, s50 +; CHECK-NEXT: v_add_nc_u32_e32 v0, s50, v57 ; CHECK-NEXT: s_add_i32 s5, s4, 5 ; CHECK-NEXT: s_add_i32 s4, s4, 1 ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s5, v42 ; CHECK-NEXT: v_mov_b32_e32 v58, s4 -; CHECK-NEXT: s_or_b32 s53, vcc_lo, s53 -; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s53 +; CHECK-NEXT: s_or_b32 s49, vcc_lo, s49 +; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s49 ; CHECK-NEXT: s_cbranch_execz .LBB0_16 ; CHECK-NEXT: .LBB0_8: ; Parent Loop BB0_5 Depth=1 ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 -; CHECK-NEXT: v_add_nc_u32_e32 v59, s54, v46 -; CHECK-NEXT: v_add_nc_u32_e32 v58, s54, v57 -; CHECK-NEXT: s_mov_b32 s55, exec_lo +; CHECK-NEXT: v_add_nc_u32_e32 v59, s50, v46 +; CHECK-NEXT: v_add_nc_u32_e32 v58, s50, v57 +; CHECK-NEXT: s_mov_b32 s51, exec_lo ; CHECK-NEXT: ds_read_u8 v0, v59 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_cmpx_eq_u16_e64 v56, v0 @@ -198,9 +198,9 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CHECK-NEXT: ds_write_b32 v0, v58 ; CHECK-NEXT: .LBB0_10: ; in Loop: Header=BB0_8 Depth=2 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s55 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s51 ; CHECK-NEXT: ds_read_u8 v0, v59 offset:1 -; CHECK-NEXT: s_mov_b32 s55, exec_lo +; CHECK-NEXT: s_mov_b32 s51, exec_lo ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_cmpx_eq_u16_e64 v56, v0 ; CHECK-NEXT: s_cbranch_execz .LBB0_12 @@ -223,9 +223,9 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CHECK-NEXT: ds_write_b32 v0, v60 ; CHECK-NEXT: .LBB0_12: ; in Loop: Header=BB0_8 Depth=2 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s55 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s51 ; CHECK-NEXT: ds_read_u8 v0, v59 offset:2 -; CHECK-NEXT: s_mov_b32 s55, exec_lo +; CHECK-NEXT: s_mov_b32 s51, exec_lo ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_cmpx_eq_u16_e64 v56, v0 ; CHECK-NEXT: s_cbranch_execz .LBB0_14 @@ -248,9 +248,9 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CHECK-NEXT: ds_write_b32 v0, v60 ; CHECK-NEXT: .LBB0_14: ; in Loop: Header=BB0_8 Depth=2 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s55 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s51 ; CHECK-NEXT: ds_read_u8 v0, v59 offset:3 -; CHECK-NEXT: s_mov_b32 s55, exec_lo +; CHECK-NEXT: s_mov_b32 s51, exec_lo ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_cmpx_eq_u16_e64 v56, v0 ; CHECK-NEXT: s_cbranch_execz .LBB0_7 @@ -275,32 +275,32 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: s_branch .LBB0_7 ; CHECK-NEXT: .LBB0_16: ; %Flow45 ; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s53 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s49 ; CHECK-NEXT: v_mov_b32_e32 v57, v0 ; CHECK-NEXT: .LBB0_17: ; %Flow46 ; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s52 -; CHECK-NEXT: s_mov_b32 s49, exec_lo +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s48 +; 
CHECK-NEXT: s_mov_b32 s45, exec_lo ; CHECK-NEXT: v_cmpx_lt_u32_e64 v58, v42 ; CHECK-NEXT: s_cbranch_execz .LBB0_23 ; CHECK-NEXT: ; %bb.18: ; %.preheader ; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1 -; CHECK-NEXT: s_mov_b32 s52, 0 +; CHECK-NEXT: s_mov_b32 s48, 0 ; CHECK-NEXT: s_inst_prefetch 0x1 ; CHECK-NEXT: s_branch .LBB0_20 ; CHECK-NEXT: .p2align 6 ; CHECK-NEXT: .LBB0_19: ; in Loop: Header=BB0_20 Depth=2 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s53 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s49 ; CHECK-NEXT: v_add_nc_u32_e32 v58, 1, v58 ; CHECK-NEXT: v_add_nc_u32_e32 v57, 1, v57 ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, v58, v42 -; CHECK-NEXT: s_or_b32 s52, vcc_lo, s52 -; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s52 +; CHECK-NEXT: s_or_b32 s48, vcc_lo, s48 +; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s48 ; CHECK-NEXT: s_cbranch_execz .LBB0_22 ; CHECK-NEXT: .LBB0_20: ; Parent Loop BB0_5 Depth=1 ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v44, v58 -; CHECK-NEXT: s_mov_b32 s53, exec_lo +; CHECK-NEXT: s_mov_b32 s49, exec_lo ; CHECK-NEXT: ds_read_u8 v0, v0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_cmpx_eq_u16_e64 v56, v0 @@ -326,15 +326,15 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: .LBB0_22: ; %Flow43 ; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1 ; CHECK-NEXT: s_inst_prefetch 0x2 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s52 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s48 ; CHECK-NEXT: .LBB0_23: ; %Flow44 ; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1 -; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s49 +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s45 ; CHECK-NEXT: ; %bb.24: ; in Loop: Header=BB0_5 Depth=1 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s48, v45 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s44, v45 ; CHECK-NEXT: v_cmp_lt_u32_e64 s4, 59, v47 ; CHECK-NEXT: v_add_nc_u32_e32 v46, 1, v46 -; CHECK-NEXT: s_mov_b32 s49, s48 +; CHECK-NEXT: s_mov_b32 s45, s44 ; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 ; CHECK-NEXT: s_and_b32 s4, exec_lo, s4 ; CHECK-NEXT: s_or_b32 s43, s4, s43 @@ -362,6 +362,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: v_cmpx_gt_u32_e64 v47, v41 ; CHECK-NEXT: s_cbranch_execz .LBB0_33 ; CHECK-NEXT: ; %bb.26: +; CHECK-NEXT: s_load_dwordx4 s[48:51], s[34:35], 0x0 ; CHECK-NEXT: s_mov_b32 s42, 0 ; CHECK-NEXT: s_branch .LBB0_28 ; CHECK-NEXT: .LBB0_27: ; in Loop: Header=BB0_28 Depth=1 @@ -395,8 +396,8 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: v_mul_u32_u24_e32 v1, 0x180, v63 ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 5, v62 ; CHECK-NEXT: v_lshlrev_b32_e32 v4, 5, v72 -; CHECK-NEXT: v_add_co_u32 v2, s4, s44, v1 -; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, s45, 0, s4 +; CHECK-NEXT: v_add_co_u32 v2, s4, s48, v1 +; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, s49, 0, s4 ; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 ; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo ; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 @@ -441,8 +442,8 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: v_and_b32_e32 v0, 0x7fffc, v0 ; CHECK-NEXT: v_lshlrev_b32_e64 v44, v1, 1 ; CHECK-NEXT: v_and_b32_e32 v74, 28, v1 -; CHECK-NEXT: v_add_co_u32 v42, s4, s50, v0 -; CHECK-NEXT: v_add_co_ci_u32_e64 v43, null, s51, 0, s4 +; CHECK-NEXT: v_add_co_u32 v42, s4, s46, v0 +; CHECK-NEXT: v_add_co_ci_u32_e64 v43, null, s47, 0, s4 ; CHECK-NEXT: v_mov_b32_e32 v2, v44 ; CHECK-NEXT: v_mov_b32_e32 v0, 
v42 ; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39] @@ -456,7 +457,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: ; %bb.30: ; in Loop: Header=BB0_28 Depth=1 ; CHECK-NEXT: v_xor_b32_e32 v4, v60, v58 ; CHECK-NEXT: v_lshrrev_b64 v[2:3], 16, v[56:57] -; CHECK-NEXT: v_mad_u64_u32 v[6:7], null, 0x180, v73, s[46:47] +; CHECK-NEXT: v_mad_u64_u32 v[6:7], null, 0x180, v73, s[50:51] ; CHECK-NEXT: v_lshlrev_b32_e32 v10, 5, v0 ; CHECK-NEXT: v_lshlrev_b32_e32 v1, 16, v4 ; CHECK-NEXT: v_lshlrev_b32_e32 v8, 6, v72 diff --git a/llvm/test/CodeGen/AMDGPU/memory_clause.ll b/llvm/test/CodeGen/AMDGPU/memory_clause.ll index 940287d44d8d1..d93cd39663a99 100644 --- a/llvm/test/CodeGen/AMDGPU/memory_clause.ll +++ b/llvm/test/CodeGen/AMDGPU/memory_clause.ll @@ -72,21 +72,22 @@ define amdgpu_kernel void @scalar_clause(ptr addrspace(1) noalias nocapture read ; GCN-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v16, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 +; GCN-NEXT: s_load_dwordx8 s[0:7], s[16:17], 0x0 +; GCN-NEXT: s_load_dwordx8 s[8:15], s[16:17], 0x20 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NEXT: v_mov_b32_e32 v8, s8 +; GCN-NEXT: v_mov_b32_e32 v12, s12 ; GCN-NEXT: v_mov_b32_e32 v5, s5 ; GCN-NEXT: v_mov_b32_e32 v6, s6 ; GCN-NEXT: v_mov_b32_e32 v7, s7 -; GCN-NEXT: v_mov_b32_e32 v8, s8 ; GCN-NEXT: v_mov_b32_e32 v9, s9 ; GCN-NEXT: v_mov_b32_e32 v10, s10 ; GCN-NEXT: v_mov_b32_e32 v11, s11 -; GCN-NEXT: v_mov_b32_e32 v12, s12 ; GCN-NEXT: v_mov_b32_e32 v13, s13 ; GCN-NEXT: v_mov_b32_e32 v14, s14 ; GCN-NEXT: v_mov_b32_e32 v15, s15 @@ -101,7 +102,9 @@ define amdgpu_kernel void @scalar_clause(ptr addrspace(1) noalias nocapture read ; GCN-SCRATCH-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x24 ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v16, 0 ; GCN-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) -; GCN-SCRATCH-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 +; GCN-SCRATCH-NEXT: s_clause 0x1 +; GCN-SCRATCH-NEXT: s_load_dwordx8 s[0:7], s[16:17], 0x0 +; GCN-SCRATCH-NEXT: s_load_dwordx8 s[8:15], s[16:17], 0x20 ; GCN-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v0, s0 ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v1, s1 diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll index 9dafa27ece86f..17466b5fb6f3c 100644 --- a/llvm/test/CodeGen/AMDGPU/min.ll +++ b/llvm/test/CodeGen/AMDGPU/min.ll @@ -2804,7 +2804,8 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32 ; ; CI-LABEL: s_test_umin_ult_v8i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x8 +; CI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8 +; CI-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x10 ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_u32 s4, s11, s19 @@ -2835,7 +2836,8 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32 ; ; VI-LABEL: s_test_umin_ult_v8i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x20 +; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 +; VI-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x40 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_u32 s4, s11, s19 @@ -2866,7 +2868,8 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32 ; ; GFX9-LABEL: s_test_umin_ult_v8i32: ; 
GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x20 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 +; GFX9-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x40 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2893,8 +2896,9 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32 ; ; GFX10-LABEL: s_test_umin_ult_v8i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x20 +; GFX10-NEXT: s_clause 0x2 +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 +; GFX10-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x40 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2920,8 +2924,9 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32 ; ; GFX11-LABEL: s_test_umin_ult_v8i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b512 s[4:19], s[0:1], 0x20 +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x20 +; GFX11-NEXT: s_load_b256 s[12:19], s[0:1], 0x40 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v8, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll index b4272049f36a4..3844c23940867 100644 --- a/llvm/test/CodeGen/AMDGPU/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/mul.ll @@ -2403,101 +2403,116 @@ endif: define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %a, i64 %b, i64 %c) { ; SI-LABEL: mul64_in_branch: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd ; SI-NEXT: s_mov_b64 s[8:9], 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u64_e64 s[10:11], s[4:5], 0 -; SI-NEXT: s_and_b64 vcc, exec, s[10:11] -; SI-NEXT: s_cbranch_vccz .LBB16_4 +; SI-NEXT: v_cmp_ne_u64_e64 s[2:3], s[4:5], 0 +; SI-NEXT: s_and_b64 vcc, exec, s[2:3] +; SI-NEXT: s_cbranch_vccz .LBB16_2 ; SI-NEXT: ; %bb.1: ; %else ; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: v_mul_hi_u32 v0, s4, v0 -; SI-NEXT: s_mul_i32 s7, s4, s7 -; SI-NEXT: s_mul_i32 s5, s5, s6 -; SI-NEXT: s_mul_i32 s4, s4, s6 -; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, s5, v0 -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: s_mul_i32 s2, s4, s7 +; SI-NEXT: s_mul_i32 s3, s5, s6 +; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v0 +; SI-NEXT: s_mul_i32 s2, s4, s6 +; SI-NEXT: v_add_i32_e32 v1, vcc, s3, v0 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: s_branch .LBB16_3 +; SI-NEXT: .LBB16_2: +; SI-NEXT: s_mov_b64 s[8:9], -1 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SI-NEXT: .LBB16_3: ; %Flow +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_andn2_b64 vcc, exec, s[8:9] -; SI-NEXT: s_cbranch_vccnz .LBB16_3 -; SI-NEXT: .LBB16_2: ; %if +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 vcc, vcc +; SI-NEXT: s_cbranch_vccnz .LBB16_5 +; SI-NEXT: ; %bb.4: ; %if ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s4, s2 ; SI-NEXT: s_mov_b32 s5, s3 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 -; SI-NEXT: .LBB16_3: ; %endif +; SI-NEXT: .LBB16_5: ; %endif ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm -; SI-NEXT: .LBB16_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 -; SI-NEXT: s_branch .LBB16_2 ; ; VI-LABEL: mul64_in_branch: ; VI: ; 
%bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; VI-NEXT: s_mov_b64 s[8:9], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u64 s[4:5], 0 -; VI-NEXT: s_cbranch_scc0 .LBB16_4 +; VI-NEXT: s_cbranch_scc0 .LBB16_2 ; VI-NEXT: ; %bb.1: ; %else ; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mad_u64_u32 v[0:1], s[10:11], s4, v0, 0 -; VI-NEXT: s_mul_i32 s4, s4, s7 -; VI-NEXT: v_add_u32_e32 v1, vcc, s4, v1 -; VI-NEXT: s_mul_i32 s4, s5, s6 -; VI-NEXT: v_add_u32_e32 v1, vcc, s4, v1 +; VI-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s4, v0, 0 +; VI-NEXT: s_mul_i32 s2, s4, s7 +; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1 +; VI-NEXT: s_mul_i32 s2, s5, s6 +; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1 +; VI-NEXT: s_branch .LBB16_3 +; VI-NEXT: .LBB16_2: +; VI-NEXT: s_mov_b64 s[8:9], -1 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; VI-NEXT: .LBB16_3: ; %Flow +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_andn2_b64 vcc, exec, s[8:9] -; VI-NEXT: s_cbranch_vccnz .LBB16_3 -; VI-NEXT: .LBB16_2: ; %if +; VI-NEXT: s_cbranch_vccnz .LBB16_5 +; VI-NEXT: ; %bb.4: ; %if ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s4, s2 ; VI-NEXT: s_mov_b32 s5, s3 ; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 -; VI-NEXT: .LBB16_3: ; %endif +; VI-NEXT: .LBB16_5: ; %endif +; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm -; VI-NEXT: .LBB16_4: -; VI-NEXT: ; implicit-def: $vgpr0_vgpr1 -; VI-NEXT: s_branch .LBB16_2 ; ; GFX9-LABEL: mul64_in_branch: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_mov_b64 s[8:9], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-NEXT: s_cbranch_scc0 .LBB16_3 +; GFX9-NEXT: s_cbranch_scc0 .LBB16_2 ; GFX9-NEXT: ; %bb.1: ; %else -; GFX9-NEXT: s_mul_i32 s7, s4, s7 -; GFX9-NEXT: s_mul_hi_u32 s10, s4, s6 -; GFX9-NEXT: s_add_i32 s7, s10, s7 -; GFX9-NEXT: s_mul_i32 s5, s5, s6 -; GFX9-NEXT: s_add_i32 s5, s7, s5 +; GFX9-NEXT: s_mul_i32 s2, s4, s7 +; GFX9-NEXT: s_mul_hi_u32 s3, s4, s6 +; GFX9-NEXT: s_add_i32 s2, s3, s2 +; GFX9-NEXT: s_mul_i32 s3, s5, s6 +; GFX9-NEXT: s_add_i32 s5, s2, s3 ; GFX9-NEXT: s_mul_i32 s4, s4, s6 +; GFX9-NEXT: s_branch .LBB16_3 +; GFX9-NEXT: .LBB16_2: +; GFX9-NEXT: s_mov_b64 s[8:9], -1 +; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-NEXT: .LBB16_3: ; %Flow +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_andn2_b64 vcc, exec, s[8:9] -; GFX9-NEXT: s_cbranch_vccnz .LBB16_4 -; GFX9-NEXT: .LBB16_2: ; %if +; GFX9-NEXT: s_cbranch_vccnz .LBB16_5 +; GFX9-NEXT: ; %bb.4: ; %if ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s4, s2 ; GFX9-NEXT: s_mov_b32 s5, s3 ; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 -; GFX9-NEXT: s_branch .LBB16_5 -; GFX9-NEXT: .LBB16_3: -; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GFX9-NEXT: s_branch .LBB16_2 -; GFX9-NEXT: .LBB16_4: +; GFX9-NEXT: s_branch .LBB16_6 +; GFX9-NEXT: .LBB16_5: ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: .LBB16_5: ; %endif +; GFX9-NEXT: .LBB16_6: ; %endif +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -2506,32 +2521,39 @@ define 
amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; GFX10-LABEL: mul64_in_branch: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX10-NEXT: s_cbranch_scc0 .LBB16_3 +; GFX10-NEXT: s_cbranch_scc0 .LBB16_2 ; GFX10-NEXT: ; %bb.1: ; %else -; GFX10-NEXT: s_mul_i32 s7, s4, s7 -; GFX10-NEXT: s_mul_hi_u32 s8, s4, s6 +; GFX10-NEXT: s_mul_i32 s2, s4, s7 +; GFX10-NEXT: s_mul_hi_u32 s3, s4, s6 ; GFX10-NEXT: s_mul_i32 s5, s5, s6 -; GFX10-NEXT: s_add_i32 s7, s8, s7 +; GFX10-NEXT: s_add_i32 s2, s3, s2 ; GFX10-NEXT: s_mul_i32 s4, s4, s6 -; GFX10-NEXT: s_add_i32 s5, s7, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB16_4 -; GFX10-NEXT: .LBB16_2: ; %if +; GFX10-NEXT: s_add_i32 s5, s2, s5 +; GFX10-NEXT: s_mov_b32 s6, 0 +; GFX10-NEXT: s_branch .LBB16_3 +; GFX10-NEXT: .LBB16_2: +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX10-NEXT: .LBB16_3: ; %Flow +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_andn2_b32 vcc_lo, exec_lo, s6 +; GFX10-NEXT: s_cbranch_vccnz .LBB16_5 +; GFX10-NEXT: ; %bb.4: ; %if ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_mov_b32 s4, s2 ; GFX10-NEXT: s_mov_b32 s5, s3 ; GFX10-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 -; GFX10-NEXT: s_branch .LBB16_5 -; GFX10-NEXT: .LBB16_3: -; GFX10-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GFX10-NEXT: s_branch .LBB16_2 -; GFX10-NEXT: .LBB16_4: +; GFX10-NEXT: s_branch .LBB16_6 +; GFX10-NEXT: .LBB16_5: ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-NEXT: .LBB16_5: ; %endif +; GFX10-NEXT: .LBB16_6: ; %endif +; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-NEXT: s_mov_b32 s2, -1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2540,31 +2562,38 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; GFX11-LABEL: mul64_in_branch: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX11-NEXT: s_cbranch_scc0 .LBB16_3 +; GFX11-NEXT: s_cbranch_scc0 .LBB16_2 ; GFX11-NEXT: ; %bb.1: ; %else -; GFX11-NEXT: s_mul_i32 s7, s4, s7 -; GFX11-NEXT: s_mul_hi_u32 s8, s4, s6 +; GFX11-NEXT: s_mul_i32 s2, s4, s7 +; GFX11-NEXT: s_mul_hi_u32 s3, s4, s6 ; GFX11-NEXT: s_mul_i32 s5, s5, s6 -; GFX11-NEXT: s_add_i32 s7, s8, s7 +; GFX11-NEXT: s_add_i32 s2, s3, s2 ; GFX11-NEXT: s_mul_i32 s4, s4, s6 -; GFX11-NEXT: s_add_i32 s5, s7, s5 -; GFX11-NEXT: s_cbranch_execnz .LBB16_4 -; GFX11-NEXT: .LBB16_2: ; %if +; GFX11-NEXT: s_add_i32 s5, s2, s5 +; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_branch .LBB16_3 +; GFX11-NEXT: .LBB16_2: +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX11-NEXT: .LBB16_3: ; %Flow +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX11-NEXT: s_cbranch_vccnz .LBB16_5 +; GFX11-NEXT: ; %bb.4: ; %if ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s4, s2 ; GFX11-NEXT: s_mov_b32 s5, s3 ; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[4:7], 0 -; GFX11-NEXT: s_branch .LBB16_5 -; GFX11-NEXT: .LBB16_3: -; GFX11-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GFX11-NEXT: s_branch .LBB16_2 -; 
GFX11-NEXT: .LBB16_4: +; GFX11-NEXT: s_branch .LBB16_6 +; GFX11-NEXT: .LBB16_5: ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX11-NEXT: .LBB16_5: ; %endif +; GFX11-NEXT: .LBB16_6: ; %endif +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2575,26 +2604,33 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; GFX12-LABEL: mul64_in_branch: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX12-NEXT: s_cbranch_scc0 .LBB16_3 +; GFX12-NEXT: s_cbranch_scc0 .LBB16_2 ; GFX12-NEXT: ; %bb.1: ; %else ; GFX12-NEXT: s_mul_u64 s[4:5], s[4:5], s[6:7] -; GFX12-NEXT: s_cbranch_execnz .LBB16_4 -; GFX12-NEXT: .LBB16_2: ; %if +; GFX12-NEXT: s_mov_b32 s6, 0 +; GFX12-NEXT: s_branch .LBB16_3 +; GFX12-NEXT: .LBB16_2: +; GFX12-NEXT: s_mov_b32 s6, -1 +; GFX12-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX12-NEXT: .LBB16_3: ; %Flow +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX12-NEXT: s_cbranch_vccnz .LBB16_5 +; GFX12-NEXT: ; %bb.4: ; %if ; GFX12-NEXT: s_mov_b32 s7, 0x31016000 ; GFX12-NEXT: s_mov_b32 s6, -1 +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s4, s2 ; GFX12-NEXT: s_mov_b32 s5, s3 ; GFX12-NEXT: buffer_load_b64 v[0:1], off, s[4:7], null -; GFX12-NEXT: s_branch .LBB16_5 -; GFX12-NEXT: .LBB16_3: -; GFX12-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GFX12-NEXT: s_branch .LBB16_2 -; GFX12-NEXT: .LBB16_4: +; GFX12-NEXT: s_branch .LBB16_6 +; GFX12-NEXT: .LBB16_5: ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: .LBB16_5: ; %endif +; GFX12-NEXT: .LBB16_6: ; %endif +; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s3, 0x31016000 ; GFX12-NEXT: s_mov_b32 s2, -1 ; GFX12-NEXT: s_wait_loadcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll index 4eefff504f19e..1a5810cc716a0 100644 --- a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll +++ b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll @@ -85,7 +85,7 @@ ; GCN-DAG: s_or_b64 [[EXIT0]], [[EXIT0]], [[TMP0]] ; GCN-DAG: s_or_b64 [[EXIT1]], [[EXIT1]], [[TMP1]] -; GCN: ; %Flow4 +; GCN: ; %Flow3 ; GCN-NEXT: s_or_b64 exec, exec, ; GCN-NEXT: s_and_saveexec_b64 {{s\[[0-9]+:[0-9]+\]}}, [[EXIT1]] ; GCN-NEXT: s_xor_b64 @@ -94,7 +94,7 @@ ; GCN-DAG: ds_write_b32 ; GCN-DAG: s_andn2_b64 [[EXIT0]], [[EXIT0]], exec -; GCN: ; %Flow5 +; GCN: ; %Flow4 ; GCN-NEXT: s_or_b64 exec, exec, ; GCN-NEXT: s_and_saveexec_b64 {{s\[[0-9]+:[0-9]+\]}}, [[EXIT0]] diff --git a/llvm/test/CodeGen/AMDGPU/sdiv.ll b/llvm/test/CodeGen/AMDGPU/sdiv.ll index ea30a63b0be19..95498152b6425 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv.ll @@ -1085,130 +1085,130 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: s_add_i32 s1, s1, s4 ; GFX9-NEXT: s_xor_b32 s6, s1, s4 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_readfirstlane_b32 s8, v4 -; GFX9-NEXT: s_ashr_i32 s9, s8, 31 -; GFX9-NEXT: s_add_i32 s8, s8, s9 +; GFX9-NEXT: v_readfirstlane_b32 s5, v4 +; GFX9-NEXT: s_ashr_i32 s9, s5, 31 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: s_add_i32 s5, s5, s9 ; GFX9-NEXT: s_xor_b32 s4, s9, s4 -; GFX9-NEXT: 
s_xor_b32 s8, s8, s9 -; GFX9-NEXT: s_sub_i32 s9, 0, s6 +; GFX9-NEXT: s_xor_b32 s5, s5, s9 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s7, v1 -; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: v_readfirstlane_b32 s5, v3 +; GFX9-NEXT: s_sub_i32 s9, 0, s6 +; GFX9-NEXT: v_readfirstlane_b32 s8, v1 +; GFX9-NEXT: v_readfirstlane_b32 s7, v5 ; GFX9-NEXT: v_readfirstlane_b32 s10, v0 ; GFX9-NEXT: s_mul_i32 s9, s9, s10 ; GFX9-NEXT: s_mul_hi_u32 s9, s10, s9 ; GFX9-NEXT: s_add_i32 s10, s10, s9 -; GFX9-NEXT: s_mul_hi_u32 s9, s8, s10 +; GFX9-NEXT: s_mul_hi_u32 s9, s5, s10 ; GFX9-NEXT: s_mul_i32 s10, s9, s6 -; GFX9-NEXT: s_sub_i32 s8, s8, s10 +; GFX9-NEXT: s_sub_i32 s5, s5, s10 ; GFX9-NEXT: s_add_i32 s11, s9, 1 -; GFX9-NEXT: s_sub_i32 s10, s8, s6 -; GFX9-NEXT: s_cmp_ge_u32 s8, s6 +; GFX9-NEXT: s_sub_i32 s10, s5, s6 +; GFX9-NEXT: s_cmp_ge_u32 s5, s6 ; GFX9-NEXT: s_cselect_b32 s9, s11, s9 -; GFX9-NEXT: s_cselect_b32 s8, s10, s8 +; GFX9-NEXT: s_cselect_b32 s5, s10, s5 ; GFX9-NEXT: s_add_i32 s10, s9, 1 -; GFX9-NEXT: s_cmp_ge_u32 s8, s6 -; GFX9-NEXT: s_cselect_b32 s6, s10, s9 -; GFX9-NEXT: s_ashr_i32 s8, s7, 31 -; GFX9-NEXT: s_add_i32 s7, s7, s8 -; GFX9-NEXT: s_xor_b32 s7, s7, s8 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX9-NEXT: v_readfirstlane_b32 s10, v5 -; GFX9-NEXT: s_ashr_i32 s11, s10, 31 -; GFX9-NEXT: s_xor_b32 s6, s6, s4 +; GFX9-NEXT: s_cmp_ge_u32 s5, s6 +; GFX9-NEXT: s_cselect_b32 s5, s10, s9 +; GFX9-NEXT: s_ashr_i32 s6, s8, 31 +; GFX9-NEXT: s_add_i32 s8, s8, s6 +; GFX9-NEXT: s_xor_b32 s8, s8, s6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 +; GFX9-NEXT: s_ashr_i32 s11, s7, 31 +; GFX9-NEXT: s_xor_b32 s5, s5, s4 +; GFX9-NEXT: s_add_i32 s7, s7, s11 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_add_i32 s10, s10, s11 -; GFX9-NEXT: s_xor_b32 s8, s11, s8 -; GFX9-NEXT: s_sub_i32 s4, s6, s4 +; GFX9-NEXT: s_xor_b32 s6, s11, s6 +; GFX9-NEXT: s_sub_i32 s4, s5, s4 +; GFX9-NEXT: s_xor_b32 s5, s7, s11 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_xor_b32 s6, s10, s11 -; GFX9-NEXT: s_sub_i32 s10, 0, s7 -; GFX9-NEXT: v_readfirstlane_b32 s9, v2 +; GFX9-NEXT: s_sub_i32 s7, 0, s8 +; GFX9-NEXT: v_readfirstlane_b32 s10, v2 +; GFX9-NEXT: v_readfirstlane_b32 s9, v6 ; GFX9-NEXT: v_readfirstlane_b32 s11, v0 -; GFX9-NEXT: s_mul_i32 s10, s10, s11 -; GFX9-NEXT: s_mul_hi_u32 s10, s11, s10 -; GFX9-NEXT: s_add_i32 s11, s11, s10 -; GFX9-NEXT: s_mul_hi_u32 s10, s6, s11 -; GFX9-NEXT: s_mul_i32 s11, s10, s7 -; GFX9-NEXT: s_sub_i32 s6, s6, s11 -; GFX9-NEXT: s_add_i32 s12, s10, 1 -; GFX9-NEXT: s_sub_i32 s11, s6, s7 -; GFX9-NEXT: s_cmp_ge_u32 s6, s7 -; GFX9-NEXT: s_cselect_b32 s10, s12, s10 -; GFX9-NEXT: s_cselect_b32 s6, s11, s6 -; GFX9-NEXT: s_add_i32 s11, s10, 1 -; GFX9-NEXT: s_cmp_ge_u32 s6, s7 -; GFX9-NEXT: s_cselect_b32 s6, s11, s10 -; GFX9-NEXT: s_ashr_i32 s7, s9, 31 -; GFX9-NEXT: s_add_i32 s9, s9, s7 -; GFX9-NEXT: s_xor_b32 s9, s9, s7 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GFX9-NEXT: v_readfirstlane_b32 s11, v6 -; GFX9-NEXT: s_ashr_i32 s12, s11, 31 -; GFX9-NEXT: s_xor_b32 s6, s6, s8 +; GFX9-NEXT: s_mul_i32 s7, s7, s11 +; GFX9-NEXT: s_mul_hi_u32 s7, s11, s7 +; GFX9-NEXT: s_add_i32 s11, s11, s7 +; GFX9-NEXT: s_mul_hi_u32 s7, s5, s11 +; GFX9-NEXT: s_mul_i32 s11, s7, s8 +; GFX9-NEXT: s_sub_i32 s5, s5, s11 +; GFX9-NEXT: s_add_i32 s12, s7, 1 +; GFX9-NEXT: s_sub_i32 s11, s5, s8 +; GFX9-NEXT: s_cmp_ge_u32 s5, s8 +; GFX9-NEXT: s_cselect_b32 s7, s12, s7 +; GFX9-NEXT: s_cselect_b32 s5, s11, s5 +; GFX9-NEXT: 
s_add_i32 s11, s7, 1 +; GFX9-NEXT: s_cmp_ge_u32 s5, s8 +; GFX9-NEXT: s_cselect_b32 s5, s11, s7 +; GFX9-NEXT: s_ashr_i32 s7, s10, 31 +; GFX9-NEXT: s_add_i32 s10, s10, s7 +; GFX9-NEXT: s_xor_b32 s8, s10, s7 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 +; GFX9-NEXT: s_ashr_i32 s12, s9, 31 +; GFX9-NEXT: s_xor_b32 s5, s5, s6 +; GFX9-NEXT: s_add_i32 s9, s9, s12 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_add_i32 s11, s11, s12 ; GFX9-NEXT: s_xor_b32 s7, s12, s7 -; GFX9-NEXT: s_sub_i32 s6, s6, s8 +; GFX9-NEXT: s_sub_i32 s5, s5, s6 +; GFX9-NEXT: s_xor_b32 s6, s9, s12 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_xor_b32 s8, s11, s12 -; GFX9-NEXT: s_sub_i32 s11, 0, s9 +; GFX9-NEXT: s_sub_i32 s9, 0, s8 +; GFX9-NEXT: v_readfirstlane_b32 s11, v3 ; GFX9-NEXT: v_readfirstlane_b32 s10, v7 ; GFX9-NEXT: v_readfirstlane_b32 s12, v0 -; GFX9-NEXT: s_mul_i32 s11, s11, s12 -; GFX9-NEXT: s_mul_hi_u32 s11, s12, s11 -; GFX9-NEXT: s_add_i32 s12, s12, s11 -; GFX9-NEXT: s_mul_hi_u32 s11, s8, s12 -; GFX9-NEXT: s_mul_i32 s12, s11, s9 -; GFX9-NEXT: s_sub_i32 s8, s8, s12 -; GFX9-NEXT: s_add_i32 s13, s11, 1 -; GFX9-NEXT: s_sub_i32 s12, s8, s9 -; GFX9-NEXT: s_cmp_ge_u32 s8, s9 -; GFX9-NEXT: s_cselect_b32 s11, s13, s11 -; GFX9-NEXT: s_cselect_b32 s8, s12, s8 -; GFX9-NEXT: s_add_i32 s12, s11, 1 -; GFX9-NEXT: s_cmp_ge_u32 s8, s9 -; GFX9-NEXT: s_cselect_b32 s8, s12, s11 -; GFX9-NEXT: s_ashr_i32 s9, s5, 31 -; GFX9-NEXT: s_add_i32 s5, s5, s9 -; GFX9-NEXT: s_xor_b32 s5, s5, s9 -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s5 +; GFX9-NEXT: s_mul_i32 s9, s9, s12 +; GFX9-NEXT: s_mul_hi_u32 s9, s12, s9 +; GFX9-NEXT: s_add_i32 s12, s12, s9 +; GFX9-NEXT: s_mul_hi_u32 s9, s6, s12 +; GFX9-NEXT: s_mul_i32 s12, s9, s8 +; GFX9-NEXT: s_sub_i32 s6, s6, s12 +; GFX9-NEXT: s_add_i32 s13, s9, 1 +; GFX9-NEXT: s_sub_i32 s12, s6, s8 +; GFX9-NEXT: s_cmp_ge_u32 s6, s8 +; GFX9-NEXT: s_cselect_b32 s9, s13, s9 +; GFX9-NEXT: s_cselect_b32 s6, s12, s6 +; GFX9-NEXT: s_add_i32 s12, s9, 1 +; GFX9-NEXT: s_cmp_ge_u32 s6, s8 +; GFX9-NEXT: s_cselect_b32 s6, s12, s9 +; GFX9-NEXT: s_ashr_i32 s8, s11, 31 +; GFX9-NEXT: s_add_i32 s11, s11, s8 +; GFX9-NEXT: s_xor_b32 s9, s11, s8 +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s9 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: s_ashr_i32 s4, s10, 31 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX9-NEXT: s_xor_b32 s6, s8, s7 -; GFX9-NEXT: s_xor_b32 s8, s4, s9 -; GFX9-NEXT: s_sub_i32 s6, s6, s7 +; GFX9-NEXT: s_xor_b32 s5, s6, s7 +; GFX9-NEXT: s_xor_b32 s6, s4, s8 +; GFX9-NEXT: s_sub_i32 s5, s5, s7 ; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX9-NEXT: s_sub_i32 s7, 0, s5 +; GFX9-NEXT: s_sub_i32 s7, 0, s9 ; GFX9-NEXT: s_add_i32 s10, s10, s4 ; GFX9-NEXT: s_xor_b32 s4, s10, s4 -; GFX9-NEXT: v_readfirstlane_b32 s9, v2 -; GFX9-NEXT: s_mul_i32 s7, s7, s9 -; GFX9-NEXT: s_mul_hi_u32 s7, s9, s7 -; GFX9-NEXT: s_add_i32 s9, s9, s7 -; GFX9-NEXT: s_mul_hi_u32 s7, s4, s9 -; GFX9-NEXT: s_mul_i32 s9, s7, s5 -; GFX9-NEXT: s_sub_i32 s4, s4, s9 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 +; GFX9-NEXT: s_mul_i32 s7, s7, s8 +; GFX9-NEXT: s_mul_hi_u32 s7, s8, s7 +; GFX9-NEXT: s_add_i32 s8, s8, s7 +; GFX9-NEXT: s_mul_hi_u32 s7, s4, s8 +; GFX9-NEXT: s_mul_i32 s8, s7, s9 +; GFX9-NEXT: s_sub_i32 s4, s4, s8 ; GFX9-NEXT: s_add_i32 s10, s7, 1 -; GFX9-NEXT: s_sub_i32 s9, s4, s5 -; GFX9-NEXT: s_cmp_ge_u32 s4, s5 +; GFX9-NEXT: s_sub_i32 s8, s4, s9 +; GFX9-NEXT: s_cmp_ge_u32 s4, s9 ; GFX9-NEXT: s_cselect_b32 s7, 
s10, s7 -; GFX9-NEXT: s_cselect_b32 s4, s9, s4 -; GFX9-NEXT: s_add_i32 s9, s7, 1 -; GFX9-NEXT: s_cmp_ge_u32 s4, s5 -; GFX9-NEXT: s_cselect_b32 s4, s9, s7 -; GFX9-NEXT: s_xor_b32 s4, s4, s8 -; GFX9-NEXT: s_sub_i32 s4, s4, s8 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_cselect_b32 s4, s8, s4 +; GFX9-NEXT: s_add_i32 s8, s7, 1 +; GFX9-NEXT: s_cmp_ge_u32 s4, s9 +; GFX9-NEXT: s_cselect_b32 s4, s8, s7 +; GFX9-NEXT: s_xor_b32 s4, s4, s6 +; GFX9-NEXT: s_sub_i32 s4, s4, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll index b086640c72f80..38571865b3cfc 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -209,7 +209,7 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-IR-NEXT: s_mov_b64 s[14:15], s[8:9] ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[20:21] ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_3 -; GCN-IR-NEXT: .LBB0_4: ; %Flow7 +; GCN-IR-NEXT: .LBB0_4: ; %Flow6 ; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[10:11], 1 ; GCN-IR-NEXT: s_or_b64 s[10:11], s[8:9], s[6:7] ; GCN-IR-NEXT: .LBB0_5: ; %udiv-end @@ -461,12 +461,12 @@ define amdgpu_kernel void @s_test_sdiv24_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-LABEL: s_test_sdiv24_64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s0, s4 -; GCN-NEXT: s_ashr_i64 s[8:9], s[0:1], 40 +; GCN-NEXT: s_ashr_i64 s[8:9], s[8:9], 40 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, s8 ; GCN-NEXT: s_mov_b32 s1, s5 ; GCN-NEXT: s_ashr_i64 s[4:5], s[6:7], 40 @@ -491,12 +491,12 @@ define amdgpu_kernel void @s_test_sdiv24_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-LABEL: s_test_sdiv24_64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_mov_b32 s0, s4 -; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[0:1], 40 +; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[8:9], 40 ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s8 ; GCN-IR-NEXT: s_mov_b32 s1, s5 ; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[6:7], 40 @@ -684,7 +684,7 @@ define amdgpu_kernel void @s_test_sdiv32_64(ptr addrspace(1) %out, i64 %x, i64 % define amdgpu_kernel void @s_test_sdiv31_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_sdiv31_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s3, s[0:1], 0xe +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -731,7 +731,7 @@ define amdgpu_kernel void @s_test_sdiv31_64(ptr addrspace(1) %out, i64 %x, i64 % ; ; GCN-IR-LABEL: s_test_sdiv31_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s3, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) @@ -786,12 +786,12 @@ define amdgpu_kernel void @s_test_sdiv23_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-LABEL: s_test_sdiv23_64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-NEXT: s_load_dwordx2 
s[8:9], s[0:1], 0xd ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s0, s4 -; GCN-NEXT: s_ashr_i64 s[8:9], s[0:1], 41 +; GCN-NEXT: s_ashr_i64 s[8:9], s[8:9], 41 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, s8 ; GCN-NEXT: s_mov_b32 s1, s5 ; GCN-NEXT: s_ashr_i64 s[4:5], s[6:7], 41 @@ -816,12 +816,12 @@ define amdgpu_kernel void @s_test_sdiv23_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-LABEL: s_test_sdiv23_64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_mov_b32 s0, s4 -; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[0:1], 41 +; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[8:9], 41 ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s8 ; GCN-IR-NEXT: s_mov_b32 s1, s5 ; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[6:7], 41 @@ -852,7 +852,7 @@ define amdgpu_kernel void @s_test_sdiv23_64(ptr addrspace(1) %out, i64 %x, i64 % define amdgpu_kernel void @s_test_sdiv25_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_sdiv25_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s3, s[0:1], 0xe +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -899,7 +899,7 @@ define amdgpu_kernel void @s_test_sdiv25_64(ptr addrspace(1) %out, i64 %x, i64 % ; ; GCN-IR-LABEL: s_test_sdiv25_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s3, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll index c9dbadcbd2315..8b54715dace72 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll @@ -309,8 +309,8 @@ define amdgpu_kernel void @mul_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in ; NOSDWA-NEXT: v_mov_b32_e32 v1, s5 ; NOSDWA-NEXT: s_waitcnt vmcnt(0) ; NOSDWA-NEXT: v_mul_lo_u16_e32 v3, v4, v2 -; NOSDWA-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; NOSDWA-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; NOSDWA-NEXT: v_mul_lo_u16_e32 v2, v4, v2 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; NOSDWA-NEXT: v_or_b32_e32 v2, v3, v2 @@ -399,11 +399,11 @@ define amdgpu_kernel void @mul_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in ; NOSDWA-NEXT: v_mov_b32_e32 v5, s5 ; NOSDWA-NEXT: s_waitcnt vmcnt(0) ; NOSDWA-NEXT: v_mul_lo_u16_e32 v6, v1, v3 -; NOSDWA-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; NOSDWA-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; NOSDWA-NEXT: v_mul_lo_u16_e32 v7, v0, v2 -; NOSDWA-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; NOSDWA-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; NOSDWA-NEXT: v_mul_lo_u16_e32 v1, v1, v3 ; NOSDWA-NEXT: v_mul_lo_u16_e32 v0, v0, v2 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -500,17 +500,17 @@ define amdgpu_kernel void @mul_v8i16(ptr addrspace(1) %out, ptr addrspace(1) %in ; NOSDWA-NEXT: v_mov_b32_e32 v9, s5 ; NOSDWA-NEXT: s_waitcnt vmcnt(0) ; NOSDWA-NEXT: v_mul_lo_u16_e32 v10, v3, v7 -; NOSDWA-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; NOSDWA-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; NOSDWA-NEXT: v_mul_lo_u16_e32 v11, v2, v6 -; NOSDWA-NEXT: 
v_lshrrev_b32_e32 v6, 16, v6 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; NOSDWA-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; NOSDWA-NEXT: v_mul_lo_u16_e32 v12, v1, v5 -; NOSDWA-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; NOSDWA-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; NOSDWA-NEXT: v_mul_lo_u16_e32 v13, v0, v4 -; NOSDWA-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; NOSDWA-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; NOSDWA-NEXT: v_mul_lo_u16_e32 v3, v3, v7 ; NOSDWA-NEXT: v_mul_lo_u16_e32 v2, v2, v6 ; NOSDWA-NEXT: v_mul_lo_u16_e32 v1, v1, v5 @@ -611,9 +611,9 @@ define amdgpu_kernel void @mul_half(ptr addrspace(1) %out, ptr addrspace(1) %ina ; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v0, s6 +; NOSDWA-NEXT: v_mov_b32_e32 v3, s1 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 ; NOSDWA-NEXT: v_mov_b32_e32 v2, s0 -; NOSDWA-NEXT: v_mov_b32_e32 v3, s1 ; NOSDWA-NEXT: flat_load_ushort v4, v[0:1] ; NOSDWA-NEXT: flat_load_ushort v2, v[2:3] ; NOSDWA-NEXT: v_mov_b32_e32 v0, s4 @@ -629,9 +629,9 @@ define amdgpu_kernel void @mul_half(ptr addrspace(1) %out, ptr addrspace(1) %ina ; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v0, s6 +; GFX89-NEXT: v_mov_b32_e32 v3, s1 ; GFX89-NEXT: v_mov_b32_e32 v1, s7 ; GFX89-NEXT: v_mov_b32_e32 v2, s0 -; GFX89-NEXT: v_mov_b32_e32 v3, s1 ; GFX89-NEXT: flat_load_ushort v4, v[0:1] ; GFX89-NEXT: flat_load_ushort v2, v[2:3] ; GFX89-NEXT: v_mov_b32_e32 v0, s4 @@ -683,8 +683,8 @@ define amdgpu_kernel void @mul_v2half(ptr addrspace(1) %out, ptr addrspace(1) %i ; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v0, s6 -; NOSDWA-NEXT: v_mov_b32_e32 v2, s0 ; NOSDWA-NEXT: v_mov_b32_e32 v3, s1 +; NOSDWA-NEXT: v_mov_b32_e32 v2, s0 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 ; NOSDWA-NEXT: flat_load_dword v2, v[2:3] ; NOSDWA-NEXT: flat_load_dword v3, v[0:1] @@ -707,9 +707,9 @@ define amdgpu_kernel void @mul_v2half(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v0, s6 +; GFX89-NEXT: v_mov_b32_e32 v3, s1 ; GFX89-NEXT: v_mov_b32_e32 v1, s7 ; GFX89-NEXT: v_mov_b32_e32 v2, s0 -; GFX89-NEXT: v_mov_b32_e32 v3, s1 ; GFX89-NEXT: flat_load_dword v4, v[0:1] ; GFX89-NEXT: flat_load_dword v2, v[2:3] ; GFX89-NEXT: v_mov_b32_e32 v0, s4 @@ -763,9 +763,9 @@ define amdgpu_kernel void @mul_v4half(ptr addrspace(1) %out, ptr addrspace(1) %i ; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v0, s6 +; NOSDWA-NEXT: v_mov_b32_e32 v3, s1 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 ; NOSDWA-NEXT: v_mov_b32_e32 v2, s0 -; NOSDWA-NEXT: v_mov_b32_e32 v3, s1 ; NOSDWA-NEXT: flat_load_dwordx2 v[2:3], v[2:3] ; NOSDWA-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; NOSDWA-NEXT: v_mov_b32_e32 v4, s4 @@ -793,9 +793,9 @@ define amdgpu_kernel void @mul_v4half(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v0, s6 +; GFX89-NEXT: v_mov_b32_e32 v3, s1 ; GFX89-NEXT: v_mov_b32_e32 v1, s7 ; GFX89-NEXT: v_mov_b32_e32 v2, s0 -; GFX89-NEXT: v_mov_b32_e32 v3, s1 ; GFX89-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX89-NEXT: flat_load_dwordx2 v[2:3], v[2:3] ; GFX89-NEXT: v_mov_b32_e32 v4, s4 @@ -854,8 +854,8 @@ define amdgpu_kernel void 
@mul_v8half(ptr addrspace(1) %out, ptr addrspace(1) %i ; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v4, s6 -; NOSDWA-NEXT: v_mov_b32_e32 v5, s7 ; NOSDWA-NEXT: v_mov_b32_e32 v0, s0 +; NOSDWA-NEXT: v_mov_b32_e32 v5, s7 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s1 ; NOSDWA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; NOSDWA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] @@ -896,9 +896,9 @@ define amdgpu_kernel void @mul_v8half(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v0, s6 +; GFX89-NEXT: v_mov_b32_e32 v5, s1 ; GFX89-NEXT: v_mov_b32_e32 v1, s7 ; GFX89-NEXT: v_mov_b32_e32 v4, s0 -; GFX89-NEXT: v_mov_b32_e32 v5, s1 ; GFX89-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GFX89-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GFX89-NEXT: v_mov_b32_e32 v8, s4 @@ -1523,8 +1523,8 @@ define amdgpu_kernel void @mac_v2half(ptr addrspace(1) %out, ptr addrspace(1) %i ; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v0, s6 -; NOSDWA-NEXT: v_mov_b32_e32 v2, s0 ; NOSDWA-NEXT: v_mov_b32_e32 v3, s1 +; NOSDWA-NEXT: v_mov_b32_e32 v2, s0 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 ; NOSDWA-NEXT: flat_load_dword v2, v[2:3] ; NOSDWA-NEXT: flat_load_dword v3, v[0:1] @@ -1547,8 +1547,8 @@ define amdgpu_kernel void @mac_v2half(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v0, s6 -; GFX89-NEXT: v_mov_b32_e32 v2, s0 ; GFX89-NEXT: v_mov_b32_e32 v3, s1 +; GFX89-NEXT: v_mov_b32_e32 v2, s0 ; GFX89-NEXT: v_mov_b32_e32 v1, s7 ; GFX89-NEXT: flat_load_dword v2, v[2:3] ; GFX89-NEXT: flat_load_dword v3, v[0:1] @@ -1689,18 +1689,18 @@ define amdgpu_kernel void @mulmul_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; NOSDWA-NEXT: v_mov_b32_e32 v3, s1 ; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; NOSDWA-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; NOSDWA-NEXT: flat_load_dword v4, v[0:1] ; NOSDWA-NEXT: flat_load_dword v2, v[2:3] -; NOSDWA-NEXT: flat_load_dword v3, v[0:1] ; NOSDWA-NEXT: v_mov_b32_e32 v0, s4 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s5 ; NOSDWA-NEXT: s_waitcnt vmcnt(1) -; NOSDWA-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; NOSDWA-NEXT: v_lshrrev_b32_e32 v3, 16, v4 ; NOSDWA-NEXT: s_waitcnt vmcnt(0) -; NOSDWA-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; NOSDWA-NEXT: v_mul_lo_u16_e32 v3, v3, v2 -; NOSDWA-NEXT: v_mul_lo_u16_e32 v5, v5, v4 -; NOSDWA-NEXT: v_mul_lo_u16_e32 v2, v3, v2 -; NOSDWA-NEXT: v_mul_lo_u16_e32 v3, v5, v4 +; NOSDWA-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; NOSDWA-NEXT: v_mul_lo_u16_e32 v3, v3, v5 +; NOSDWA-NEXT: v_mul_lo_u16_e32 v4, v4, v2 +; NOSDWA-NEXT: v_mul_lo_u16_e32 v3, v3, v5 +; NOSDWA-NEXT: v_mul_lo_u16_e32 v2, v4, v2 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; NOSDWA-NEXT: v_or_b32_e32 v2, v2, v3 ; NOSDWA-NEXT: flat_store_dword v[0:1], v2 @@ -1783,9 +1783,9 @@ define amdgpu_kernel void @add_bb_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v0, s6 +; NOSDWA-NEXT: v_mov_b32_e32 v3, s1 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 ; NOSDWA-NEXT: v_mov_b32_e32 v2, s0 -; NOSDWA-NEXT: v_mov_b32_e32 v3, s1 ; NOSDWA-NEXT: flat_load_dword v1, v[0:1] ; NOSDWA-NEXT: flat_load_dword v2, v[2:3] ; NOSDWA-NEXT: v_mov_b32_e32 v0, s4 @@ -1808,9 +1808,9 @@ define amdgpu_kernel void @add_bb_v2i16(ptr addrspace(1) %out, 
ptr addrspace(1) ; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v0, s6 +; GFX89-NEXT: v_mov_b32_e32 v3, s1 ; GFX89-NEXT: v_mov_b32_e32 v1, s7 ; GFX89-NEXT: v_mov_b32_e32 v2, s0 -; GFX89-NEXT: v_mov_b32_e32 v3, s1 ; GFX89-NEXT: flat_load_dword v1, v[0:1] ; GFX89-NEXT: flat_load_dword v2, v[2:3] ; GFX89-NEXT: v_mov_b32_e32 v0, s4 diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll index b67ecc2f9d13c..2f400dd725710 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll @@ -12,11 +12,12 @@ define amdgpu_kernel void @sgpr_if_else_salu_br(ptr addrspace(1) %out, i32 %a, i ; SI-LABEL: sgpr_if_else_salu_br: ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; SI-NEXT: s_load_dword s2, s[0:1], 0xf ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s4, 0 ; SI-NEXT: s_cbranch_scc0 .LBB0_4 ; SI-NEXT: ; %bb.1: ; %else +; SI-NEXT: s_load_dword s2, s[0:1], 0xf +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_add_i32 s7, s7, s2 ; SI-NEXT: s_cbranch_execnz .LBB0_3 ; SI-NEXT: .LBB0_2: ; %if @@ -158,45 +159,47 @@ endif: define amdgpu_kernel void @sgpr_if_else_valu_cmp_phi_br(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { ; SI-LABEL: sgpr_if_else_valu_cmp_phi_br: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: ; implicit-def: $sgpr8_sgpr9 -; SI-NEXT: s_and_saveexec_b64 s[10:11], vcc -; SI-NEXT: s_xor_b64 s[10:11], exec, s[10:11] +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5 +; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc +; SI-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; SI-NEXT: s_cbranch_execz .LBB3_2 ; SI-NEXT: ; %bb.1: ; %else -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v0 -; SI-NEXT: s_and_b64 s[8:9], vcc, exec +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: .LBB3_2: ; %Flow -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_andn2_saveexec_b64 s[0:1], s[10:11] +; SI-NEXT: s_or_saveexec_b64 s[6:7], s[2:3] +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_xor_b64 exec, exec, s[6:7] ; SI-NEXT: s_cbranch_execz .LBB3_4 ; SI-NEXT: ; %bb.3: ; %if -; SI-NEXT: s_mov_b32 s15, 0xf000 -; SI-NEXT: s_mov_b32 s14, 0 -; SI-NEXT: s_mov_b64 s[12:13], s[6:7] +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64 -; SI-NEXT: s_andn2_b64 s[2:3], s[8:9], exec +; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_andn2_b64 s[2:3], s[4:5], exec ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; SI-NEXT: s_and_b64 s[6:7], vcc, exec -; SI-NEXT: s_or_b64 s[8:9], s[2:3], s[6:7] +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_or_b64 s[4:5], s[2:3], s[4:5] ; SI-NEXT: .LBB3_4: ; %endif -; SI-NEXT: s_or_b64 exec, exec, s[0:1] -; SI-NEXT: s_mov_b32 s7, 0xf000 
-; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[8:9] -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_or_b64 exec, exec, s[6:7] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 diff --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll index 4b02d00ddce1e..bf3ebedd45016 100644 --- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll +++ b/llvm/test/CodeGen/AMDGPU/shift-i128.ll @@ -181,29 +181,30 @@ define i128 @v_ashr_i128_kv(i128 %rhs) { define amdgpu_kernel void @s_shl_i128_ss(i128 %lhs, i128 %rhs) { ; GCN-LABEL: s_shl_i128_ss: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 +; GCN-NEXT: s_load_dword s10, s[4:5], 0x4 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_sub_i32 s5, s4, 64 -; GCN-NEXT: s_sub_i32 s12, 64, s4 -; GCN-NEXT: s_lshl_b64 s[6:7], s[2:3], s4 -; GCN-NEXT: s_lshl_b64 s[8:9], s[0:1], s4 -; GCN-NEXT: s_lshl_b64 s[10:11], s[0:1], s5 -; GCN-NEXT: s_lshr_b64 s[0:1], s[0:1], s12 -; GCN-NEXT: s_or_b64 s[0:1], s[6:7], s[0:1] -; GCN-NEXT: s_cmp_lt_u32 s4, 64 -; GCN-NEXT: s_cselect_b32 s0, s0, s10 -; GCN-NEXT: s_cselect_b32 s1, s1, s11 -; GCN-NEXT: s_cselect_b32 s5, s9, 0 -; GCN-NEXT: s_cselect_b32 s6, s8, 0 -; GCN-NEXT: s_cmp_eq_u32 s4, 0 -; GCN-NEXT: s_cselect_b32 s1, s3, s1 -; GCN-NEXT: s_cselect_b32 s0, s2, s0 -; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: v_mov_b32_e32 v2, s0 -; GCN-NEXT: v_mov_b32_e32 v3, s1 +; GCN-NEXT: s_sub_i32 s8, 64, s10 +; GCN-NEXT: s_sub_i32 s6, s10, 64 +; GCN-NEXT: s_lshl_b64 s[4:5], s[2:3], s10 +; GCN-NEXT: s_lshr_b64 s[8:9], s[0:1], s8 +; GCN-NEXT: s_lshl_b64 s[6:7], s[0:1], s6 +; GCN-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; GCN-NEXT: s_lshl_b64 s[0:1], s[0:1], s10 +; GCN-NEXT: s_cmp_lt_u32 s10, 64 +; GCN-NEXT: s_cselect_b32 s4, s4, s6 +; GCN-NEXT: s_cselect_b32 s5, s5, s7 +; GCN-NEXT: s_cselect_b32 s1, s1, 0 +; GCN-NEXT: s_cselect_b32 s0, s0, 0 +; GCN-NEXT: s_cmp_eq_u32 s10, 0 +; GCN-NEXT: s_cselect_b32 s3, s3, s5 +; GCN-NEXT: s_cselect_b32 s2, s2, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm %shift = shl i128 %lhs, %rhs @@ -214,29 +215,30 @@ define amdgpu_kernel void @s_shl_i128_ss(i128 %lhs, i128 %rhs) { define amdgpu_kernel void @s_lshr_i128_ss(i128 %lhs, i128 %rhs) { ; GCN-LABEL: s_lshr_i128_ss: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 +; GCN-NEXT: s_load_dword s10, s[4:5], 0x4 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_sub_i32 s5, s4, 64 -; GCN-NEXT: s_sub_i32 s12, 64, s4 -; GCN-NEXT: s_lshr_b64 s[6:7], s[0:1], s4 -; GCN-NEXT: s_lshr_b64 s[8:9], s[2:3], s4 -; GCN-NEXT: s_lshr_b64 s[10:11], s[2:3], s5 -; GCN-NEXT: s_lshl_b64 s[2:3], s[2:3], s12 -; GCN-NEXT: s_or_b64 s[2:3], s[6:7], s[2:3] -; GCN-NEXT: s_cmp_lt_u32 s4, 64 -; GCN-NEXT: s_cselect_b32 s2, s2, s10 -; GCN-NEXT: s_cselect_b32 s3, s3, s11 -; GCN-NEXT: s_cselect_b32 s5, s9, 0 -; GCN-NEXT: s_cselect_b32 s6, s8, 0 -; GCN-NEXT: s_cmp_eq_u32 s4, 0 -; GCN-NEXT: 
s_cselect_b32 s1, s1, s3 -; GCN-NEXT: s_cselect_b32 s0, s0, s2 +; GCN-NEXT: s_sub_i32 s8, 64, s10 +; GCN-NEXT: s_sub_i32 s6, s10, 64 +; GCN-NEXT: s_lshr_b64 s[4:5], s[0:1], s10 +; GCN-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 +; GCN-NEXT: s_lshr_b64 s[6:7], s[2:3], s6 +; GCN-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; GCN-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 +; GCN-NEXT: s_cmp_lt_u32 s10, 64 +; GCN-NEXT: s_cselect_b32 s4, s4, s6 +; GCN-NEXT: s_cselect_b32 s5, s5, s7 +; GCN-NEXT: s_cselect_b32 s3, s3, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 0 +; GCN-NEXT: s_cmp_eq_u32 s10, 0 +; GCN-NEXT: s_cselect_b32 s1, s1, s5 +; GCN-NEXT: s_cselect_b32 s0, s0, s4 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_mov_b32_e32 v2, s6 -; GCN-NEXT: v_mov_b32_e32 v3, s5 +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm %shift = lshr i128 %lhs, %rhs @@ -247,26 +249,27 @@ define amdgpu_kernel void @s_lshr_i128_ss(i128 %lhs, i128 %rhs) { define amdgpu_kernel void @s_ashr_i128_ss(i128 %lhs, i128 %rhs) { ; GCN-LABEL: s_ashr_i128_ss: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 +; GCN-NEXT: s_load_dword s10, s[4:5], 0x4 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_sub_i32 s5, 64, s4 -; GCN-NEXT: s_lshr_b64 s[6:7], s[0:1], s4 -; GCN-NEXT: s_sub_i32 s10, s4, 64 -; GCN-NEXT: s_lshl_b64 s[8:9], s[2:3], s5 -; GCN-NEXT: s_ashr_i32 s12, s3, 31 -; GCN-NEXT: s_ashr_i64 s[10:11], s[2:3], s10 -; GCN-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] -; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], s4 -; GCN-NEXT: s_cmp_lt_u32 s4, 64 -; GCN-NEXT: s_cselect_b32 s3, s3, s12 -; GCN-NEXT: s_cselect_b32 s2, s2, s12 -; GCN-NEXT: s_cselect_b32 s5, s6, s10 -; GCN-NEXT: s_cselect_b32 s6, s7, s11 -; GCN-NEXT: s_cmp_eq_u32 s4, 0 -; GCN-NEXT: s_cselect_b32 s1, s1, s6 -; GCN-NEXT: s_cselect_b32 s0, s0, s5 +; GCN-NEXT: s_sub_i32 s6, 64, s10 +; GCN-NEXT: s_lshr_b64 s[4:5], s[0:1], s10 +; GCN-NEXT: s_sub_i32 s8, s10, 64 +; GCN-NEXT: s_lshl_b64 s[6:7], s[2:3], s6 +; GCN-NEXT: s_ashr_i64 s[8:9], s[2:3], s8 +; GCN-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] +; GCN-NEXT: s_ashr_i32 s6, s3, 31 +; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], s10 +; GCN-NEXT: s_cmp_lt_u32 s10, 64 +; GCN-NEXT: s_cselect_b32 s3, s3, s6 +; GCN-NEXT: s_cselect_b32 s2, s2, s6 +; GCN-NEXT: s_cselect_b32 s4, s4, s8 +; GCN-NEXT: s_cselect_b32 s5, s5, s9 +; GCN-NEXT: s_cmp_eq_u32 s10, 0 +; GCN-NEXT: s_cselect_b32 s1, s1, s5 +; GCN-NEXT: s_cselect_b32 s0, s0, s4 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s2 @@ -430,25 +433,26 @@ define <2 x i128> @v_ashr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { define amdgpu_kernel void @s_shl_v2i128ss(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-LABEL: s_shl_v2i128ss: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx16 s[0:15], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8 +; GCN-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 ; GCN-NEXT: v_mov_b32_e32 v6, 16 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: v_mov_b32_e32 v7, 0 -; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cmp_lt_u64_e64 s[16:17], s[8:9], 64 ; GCN-NEXT: v_cmp_eq_u64_e64 s[18:19], s[10:11], 0 ; GCN-NEXT: s_sub_i32 s22, 64, s8 ; GCN-NEXT: s_sub_i32 s20, s8, 64 -; GCN-NEXT: s_lshr_b64 s[22:23], s[0:1], s22 ; GCN-NEXT: s_and_b64 s[16:17], s[18:19], s[16:17] -; GCN-NEXT: s_lshl_b64 s[18:19], s[2:3], 
s8 +; GCN-NEXT: s_lshr_b64 s[18:19], s[0:1], s22 +; GCN-NEXT: s_lshl_b64 s[22:23], s[2:3], s8 ; GCN-NEXT: s_lshl_b64 s[20:21], s[0:1], s20 -; GCN-NEXT: s_or_b64 s[18:19], s[18:19], s[22:23] +; GCN-NEXT: s_or_b64 s[18:19], s[22:23], s[18:19] ; GCN-NEXT: s_and_b64 s[22:23], s[16:17], exec ; GCN-NEXT: s_cselect_b32 s19, s19, s21 ; GCN-NEXT: s_or_b64 s[10:11], s[8:9], s[10:11] ; GCN-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], 0 +; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: s_and_b64 s[22:23], s[10:11], exec ; GCN-NEXT: s_cselect_b32 s9, s3, s19 ; GCN-NEXT: s_and_b64 s[22:23], s[16:17], exec @@ -502,25 +506,26 @@ define amdgpu_kernel void @s_shl_v2i128ss(<2 x i128> %lhs, <2 x i128> %rhs) { define amdgpu_kernel void @s_lshr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-LABEL: s_lshr_v2i128_ss: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx16 s[0:15], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8 +; GCN-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 ; GCN-NEXT: v_mov_b32_e32 v6, 16 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: v_mov_b32_e32 v7, 0 -; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cmp_lt_u64_e64 s[16:17], s[8:9], 64 ; GCN-NEXT: v_cmp_eq_u64_e64 s[18:19], s[10:11], 0 ; GCN-NEXT: s_sub_i32 s22, 64, s8 ; GCN-NEXT: s_sub_i32 s20, s8, 64 -; GCN-NEXT: s_lshl_b64 s[22:23], s[2:3], s22 ; GCN-NEXT: s_and_b64 s[16:17], s[18:19], s[16:17] -; GCN-NEXT: s_lshr_b64 s[18:19], s[0:1], s8 +; GCN-NEXT: s_lshl_b64 s[18:19], s[2:3], s22 +; GCN-NEXT: s_lshr_b64 s[22:23], s[0:1], s8 ; GCN-NEXT: s_lshr_b64 s[20:21], s[2:3], s20 -; GCN-NEXT: s_or_b64 s[18:19], s[18:19], s[22:23] +; GCN-NEXT: s_or_b64 s[18:19], s[22:23], s[18:19] ; GCN-NEXT: s_and_b64 s[22:23], s[16:17], exec ; GCN-NEXT: s_cselect_b32 s19, s19, s21 ; GCN-NEXT: s_or_b64 s[10:11], s[8:9], s[10:11] ; GCN-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], 0 +; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: s_and_b64 s[22:23], s[10:11], exec ; GCN-NEXT: s_cselect_b32 s9, s1, s19 ; GCN-NEXT: s_and_b64 s[22:23], s[16:17], exec @@ -574,25 +579,26 @@ define amdgpu_kernel void @s_lshr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) { define amdgpu_kernel void @s_ashr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-LABEL: s_ashr_v2i128_ss: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx16 s[0:15], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8 +; GCN-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 ; GCN-NEXT: v_mov_b32_e32 v6, 16 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: v_mov_b32_e32 v7, 0 -; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cmp_lt_u64_e64 s[16:17], s[8:9], 64 ; GCN-NEXT: v_cmp_eq_u64_e64 s[18:19], s[10:11], 0 ; GCN-NEXT: s_sub_i32 s22, 64, s8 ; GCN-NEXT: s_sub_i32 s20, s8, 64 -; GCN-NEXT: s_lshl_b64 s[22:23], s[2:3], s22 ; GCN-NEXT: s_and_b64 s[16:17], s[18:19], s[16:17] -; GCN-NEXT: s_lshr_b64 s[18:19], s[0:1], s8 +; GCN-NEXT: s_lshl_b64 s[18:19], s[2:3], s22 +; GCN-NEXT: s_lshr_b64 s[22:23], s[0:1], s8 ; GCN-NEXT: s_ashr_i64 s[20:21], s[2:3], s20 -; GCN-NEXT: s_or_b64 s[18:19], s[18:19], s[22:23] +; GCN-NEXT: s_or_b64 s[18:19], s[22:23], s[18:19] ; GCN-NEXT: s_and_b64 s[22:23], s[16:17], exec ; GCN-NEXT: s_cselect_b32 s19, s19, s21 ; GCN-NEXT: s_or_b64 s[10:11], s[8:9], s[10:11] ; GCN-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], 0 +; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: s_and_b64 s[22:23], s[10:11], exec ; GCN-NEXT: s_cselect_b32 s9, s1, s19 ; GCN-NEXT: s_and_b64 s[22:23], s[16:17], exec diff --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll index 
b3f4790df4d48..d6257a71e5b20 100644 --- a/llvm/test/CodeGen/AMDGPU/shl.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.ll @@ -931,7 +931,8 @@ define amdgpu_kernel void @shl_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 +; VI-NEXT: s_load_dwordx8 s[0:7], s[18:19], 0x0 +; VI-NEXT: s_load_dwordx8 s[8:15], s[18:19], 0x20 ; VI-NEXT: s_mov_b32 s19, 0xf000 ; VI-NEXT: s_mov_b32 s18, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll b/llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll index 3176257920a7a..3cd89b44615ab 100644 --- a/llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll +++ b/llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll @@ -3,23 +3,32 @@ define protected amdgpu_kernel void @excess_soft_clause_reg_pressure(ptr addrspace(4) %wei_ptr, ptr addrspace(1) %out_ptr, ptr addrspace(1) %in) { ; CHECK-LABEL: excess_soft_clause_reg_pressure: ; CHECK: BB0_1: ; %for.cond28.preheader -; CHECK: s_load_dwordx16 -; CHECK-NEXT: s_load_dwordx16 +; CHECK: s_load_dwordx8 +; CHECK-NEXT: s_load_dwordx8 +; CHECK-NEXT: s_load_dwordx8 +; CHECK-NEXT: s_load_dwordx8 ; CHECK: global_load_dword ; CHECK-NEXT: global_load_dword ; CHECK-NEXT: global_load_dword ; CHECK-NEXT: global_load_dword -; CHECK: s_load_dwordx16 -; CHECK-NEXT: s_load_dwordx16 +; CHECK: s_load_dwordx8 +; CHECK-NEXT: s_load_dwordx8 +; CHECK-NEXT: s_load_dwordx8 +; CHECK-NEXT: s_load_dwordx8 ; CHECK-NOT: v_writelane_b32 ; CHECK-NOT: v_readlane_b32 -; CHECK: s_load_dwordx16 -; CHECK: s_load_dwordx16 -; CHECK: s_load_dwordx16 +; CHECK: s_load_dwordx8 +; CHECK-NEXT: s_load_dwordx8 +; CHECK-NEXT: s_load_dwordx8 +; CHECK-NEXT: s_load_dwordx8 +; CHECK: s_load_dwordx8 +; CHECK-NEXT: s_load_dwordx8 +; CHECK-NEXT: s_load_dwordx8 +; CHECK-NEXT: s_load_dwordx8 ; CHECK-NOT: v_writelane_b32 ; CHECK-NOT: v_readlane_b32 diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll index ed7f27b367fda..367f960b04ff9 100644 --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -181,7 +181,7 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-IR-NEXT: s_mov_b64 s[14:15], s[6:7] ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[18:19] ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_3 -; GCN-IR-NEXT: .LBB0_4: ; %Flow7 +; GCN-IR-NEXT: .LBB0_4: ; %Flow6 ; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[8:9], 1 ; GCN-IR-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9] ; GCN-IR-NEXT: .LBB0_5: ; %udiv-end @@ -443,12 +443,12 @@ define amdgpu_kernel void @s_test_srem23_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-LABEL: s_test_srem23_64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s0, s4 -; GCN-NEXT: s_ashr_i64 s[8:9], s[0:1], 41 +; GCN-NEXT: s_ashr_i64 s[8:9], s[8:9], 41 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, s8 ; GCN-NEXT: s_mov_b32 s1, s5 ; GCN-NEXT: s_ashr_i64 s[4:5], s[6:7], 41 @@ -478,12 +478,12 @@ define amdgpu_kernel void @s_test_srem23_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-LABEL: s_test_srem23_64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dwordx2 s[8:9], s[0:1], 
0xd ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_mov_b32 s0, s4 -; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[0:1], 41 +; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[8:9], 41 ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s8 ; GCN-IR-NEXT: s_mov_b32 s1, s5 ; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[6:7], 41 @@ -520,12 +520,12 @@ define amdgpu_kernel void @s_test_srem24_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-LABEL: s_test_srem24_64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s0, s4 -; GCN-NEXT: s_ashr_i64 s[8:9], s[0:1], 40 +; GCN-NEXT: s_ashr_i64 s[8:9], s[8:9], 40 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, s8 ; GCN-NEXT: s_mov_b32 s1, s5 ; GCN-NEXT: s_ashr_i64 s[4:5], s[6:7], 40 @@ -555,12 +555,12 @@ define amdgpu_kernel void @s_test_srem24_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-LABEL: s_test_srem24_64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_mov_b32 s0, s4 -; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[0:1], 40 +; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[8:9], 40 ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s8 ; GCN-IR-NEXT: s_mov_b32 s1, s5 ; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[6:7], 40 @@ -650,7 +650,7 @@ define i64 @v_test_srem24_64(i64 %x, i64 %y) { define amdgpu_kernel void @s_test_srem25_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_srem25_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s3, s[0:1], 0xe +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -694,7 +694,7 @@ define amdgpu_kernel void @s_test_srem25_64(ptr addrspace(1) %out, i64 %x, i64 % ; ; GCN-IR-LABEL: s_test_srem25_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s3, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) @@ -745,7 +745,7 @@ define amdgpu_kernel void @s_test_srem25_64(ptr addrspace(1) %out, i64 %x, i64 % define amdgpu_kernel void @s_test_srem31_64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_srem31_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s3, s[0:1], 0xe +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -789,7 +789,7 @@ define amdgpu_kernel void @s_test_srem31_64(ptr addrspace(1) %out, i64 %x, i64 % ; ; GCN-IR-LABEL: s_test_srem31_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s3, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) @@ -1138,7 +1138,7 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: s_mov_b64 s[16:17], s[6:7] ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[20:21] ; GCN-IR-NEXT: s_cbranch_vccz .LBB8_3 -; GCN-IR-NEXT: .LBB8_4: ; %Flow7 +; GCN-IR-NEXT: .LBB8_4: ; %Flow6 ; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[10:11], 1 ; GCN-IR-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11] ; GCN-IR-NEXT: .LBB8_5: ; %udiv-end diff 
--git a/llvm/test/CodeGen/AMDGPU/srl.ll b/llvm/test/CodeGen/AMDGPU/srl.ll index 418c160d4244a..85bea6d66e3a0 100644 --- a/llvm/test/CodeGen/AMDGPU/srl.ll +++ b/llvm/test/CodeGen/AMDGPU/srl.ll @@ -286,7 +286,8 @@ define amdgpu_kernel void @lshr_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 +; VI-NEXT: s_load_dwordx8 s[0:7], s[18:19], 0x0 +; VI-NEXT: s_load_dwordx8 s[8:15], s[18:19], 0x20 ; VI-NEXT: s_mov_b32 s19, 0xf000 ; VI-NEXT: s_mov_b32 s18, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/sub.ll b/llvm/test/CodeGen/AMDGPU/sub.ll index ded308ae4f230..8bba91a180a08 100644 --- a/llvm/test/CodeGen/AMDGPU/sub.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.ll @@ -330,12 +330,12 @@ define amdgpu_kernel void @test_sub_v4i32(ptr addrspace(1) %out, ptr addrspace(1 ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s4, s2, 16 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: s_addc_u32 s5, s3, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: s_add_u32 s2, s2, 16 -; GFX8-NEXT: s_addc_u32 s3, s3, 0 -; GFX8-NEXT: v_mov_b32_e32 v5, s3 -; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GFX8-NEXT: v_mov_b32_e32 v8, s0 @@ -353,13 +353,13 @@ define amdgpu_kernel void @test_sub_v4i32(ptr addrspace(1) %out, ptr addrspace(1 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] offset:16 -; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] +; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] +; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_sub_u32_e32 v3, v7, v3 -; GFX9-NEXT: v_sub_u32_e32 v2, v6, v2 -; GFX9-NEXT: v_sub_u32_e32 v1, v5, v1 -; GFX9-NEXT: v_sub_u32_e32 v0, v4, v0 +; GFX9-NEXT: v_sub_u32_e32 v3, v3, v7 +; GFX9-NEXT: v_sub_u32_e32 v2, v2, v6 +; GFX9-NEXT: v_sub_u32_e32 v1, v1, v5 +; GFX9-NEXT: v_sub_u32_e32 v0, v0, v4 ; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -369,13 +369,13 @@ define amdgpu_kernel void @test_sub_v4i32(ptr addrspace(1) %out, ptr addrspace(1 ; GFX12-NEXT: v_mov_b32_e32 v8, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_load_b128 v[0:3], v8, s[2:3] offset:16 -; GFX12-NEXT: global_load_b128 v[4:7], v8, s[2:3] +; GFX12-NEXT: global_load_b128 v[0:3], v8, s[2:3] +; GFX12-NEXT: global_load_b128 v[4:7], v8, s[2:3] offset:16 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_sub_nc_u32_e32 v3, v7, v3 -; GFX12-NEXT: v_sub_nc_u32_e32 v2, v6, v2 -; GFX12-NEXT: v_sub_nc_u32_e32 v1, v5, v1 -; GFX12-NEXT: v_sub_nc_u32_e32 v0, v4, v0 +; GFX12-NEXT: v_sub_nc_u32_e32 v3, v3, v7 +; GFX12-NEXT: v_sub_nc_u32_e32 v2, v2, v6 +; GFX12-NEXT: v_sub_nc_u32_e32 v1, v1, v5 +; GFX12-NEXT: v_sub_nc_u32_e32 v0, v0, v4 ; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll index 6ec213a06999b..97081860abfb0 100644 --- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll @@ -127,8 +127,8 @@ define 
amdgpu_kernel void @s_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace ; VI-NEXT: s_lshr_b32 s5, s7, 16 ; VI-NEXT: s_sub_i32 s6, s6, s7 ; VI-NEXT: s_sub_i32 s4, s4, s5 -; VI-NEXT: s_and_b32 s5, s6, 0xffff ; VI-NEXT: s_lshl_b32 s4, s4, 16 +; VI-NEXT: s_and_b32 s5, s6, 0xffff ; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll index c0c56ebb16610..413f4df91fdf3 100644 --- a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll @@ -111,11 +111,11 @@ define amdgpu_kernel void @truncate_high_elt_extract_vector(ptr addrspace(1) noc ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_sext_i32_i16 s0, s2 -; VI-NEXT: s_sext_i32_i16 s1, s3 -; VI-NEXT: s_mul_i32 s1, s1, s0 -; VI-NEXT: s_lshr_b32 s0, s1, 16 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_sext_i32_i16 s2, s2 +; VI-NEXT: s_sext_i32_i16 s3, s3 +; VI-NEXT: s_mul_i32 s3, s3, s2 +; VI-NEXT: s_lshr_b32 s2, s3, 16 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm bb: diff --git a/llvm/test/CodeGen/AMDGPU/uaddo.ll b/llvm/test/CodeGen/AMDGPU/uaddo.ll index 416dbb226422c..47cb82b594779 100644 --- a/llvm/test/CodeGen/AMDGPU/uaddo.ll +++ b/llvm/test/CodeGen/AMDGPU/uaddo.ll @@ -706,83 +706,88 @@ exit: define amdgpu_kernel void @v_uaddo_clamp_bit(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_uaddo_clamp_bit: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s14, s2 -; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s14, s6 +; SI-NEXT: s_mov_b32 s15, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s8 -; SI-NEXT: s_mov_b32 s1, s9 +; SI-NEXT: s_mov_b32 s4, s8 +; SI-NEXT: s_mov_b32 s5, s9 ; SI-NEXT: s_mov_b32 s12, s10 ; SI-NEXT: s_mov_b32 s13, s11 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e64 v0, s[0:1], v1, v2 +; SI-NEXT: v_add_i32_e64 v0, s[2:3], v1, v2 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_cbranch_vccnz .LBB9_2 ; SI-NEXT: ; %bb.1: ; %if -; SI-NEXT: s_xor_b64 s[8:9], s[0:1], -1 +; SI-NEXT: s_xor_b64 s[4:5], s[2:3], -1 ; SI-NEXT: .LBB9_2: ; %exit -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[8:9] -; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_byte v1, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: 
v_uaddo_clamp_bit: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 -; VI-NEXT: s_mov_b64 s[2:3], 0 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: v_mov_b32_e32 v2, s10 -; VI-NEXT: v_mov_b32_e32 v3, s11 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: flat_load_dword v1, v[0:1] ; VI-NEXT: flat_load_dword v2, v[2:3] +; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; VI-NEXT: v_add_u32_e64 v0, s[0:1], v1, v2 +; VI-NEXT: v_add_u32_e64 v0, s[2:3], v1, v2 ; VI-NEXT: s_cbranch_vccnz .LBB9_2 ; VI-NEXT: ; %bb.1: ; %if -; VI-NEXT: s_xor_b64 s[2:3], s[0:1], -1 +; VI-NEXT: s_xor_b64 s[4:5], s[2:3], -1 ; VI-NEXT: .LBB9_2: ; %exit -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_mov_b32_e32 v3, s6 -; VI-NEXT: v_mov_b32_e32 v4, s7 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_mov_b32_e32 v3, s2 +; VI-NEXT: v_mov_b32_e32 v4, s3 ; VI-NEXT: flat_store_dword v[1:2], v0 -; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] -; VI-NEXT: flat_store_byte v[3:4], v0 +; VI-NEXT: flat_store_byte v[3:4], v5 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_uaddo_clamp_bit: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[8:9] -; GFX9-NEXT: global_load_dword v3, v0, s[10:11] +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] +; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX9-NEXT: v_add_co_u32_e64 v1, s[0:1], v2, v3 +; GFX9-NEXT: v_add_co_u32_e64 v1, s[2:3], v2, v3 ; GFX9-NEXT: s_cbranch_vccnz .LBB9_2 ; GFX9-NEXT: ; %bb.1: ; %if -; GFX9-NEXT: s_xor_b64 s[2:3], s[0:1], -1 +; GFX9-NEXT: s_xor_b64 s[4:5], s[2:3], -1 ; GFX9-NEXT: .LBB9_2: ; %exit -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] -; GFX9-NEXT: global_store_byte v0, v1, s[6:7] +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_byte v0, v2, s[2:3] ; GFX9-NEXT: s_endpgm entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll index 48b9c72ea6892..c690a54138fd7 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -182,7 +182,7 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-IR-NEXT: s_mov_b64 s[10:11], s[6:7] ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[16:17] ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_3 -; GCN-IR-NEXT: .LBB0_4: ; %Flow7 +; GCN-IR-NEXT: .LBB0_4: ; %Flow6 ; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[8:9], 1 ; GCN-IR-NEXT: s_or_b64 s[8:9], s[6:7], s[2:3] ; GCN-IR-NEXT: .LBB0_5: ; %udiv-end diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll index f35589853393c..727cc72220903 100644 --- 
a/llvm/test/CodeGen/AMDGPU/urem64.ll +++ b/llvm/test/CodeGen/AMDGPU/urem64.ll @@ -181,7 +181,7 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-IR-NEXT: s_mov_b64 s[14:15], s[6:7] ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[18:19] ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_3 -; GCN-IR-NEXT: .LBB0_4: ; %Flow7 +; GCN-IR-NEXT: .LBB0_4: ; %Flow6 ; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[8:9], 1 ; GCN-IR-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9] ; GCN-IR-NEXT: .LBB0_5: ; %udiv-end diff --git a/llvm/test/CodeGen/AMDGPU/usubo.ll b/llvm/test/CodeGen/AMDGPU/usubo.ll index 666ae7c126ae3..59ac1a36af931 100644 --- a/llvm/test/CodeGen/AMDGPU/usubo.ll +++ b/llvm/test/CodeGen/AMDGPU/usubo.ll @@ -707,83 +707,88 @@ exit: define amdgpu_kernel void @v_usubo_clamp_bit(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_usubo_clamp_bit: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s14, s2 -; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s14, s6 +; SI-NEXT: s_mov_b32 s15, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s8 -; SI-NEXT: s_mov_b32 s1, s9 +; SI-NEXT: s_mov_b32 s4, s8 +; SI-NEXT: s_mov_b32 s5, s9 ; SI-NEXT: s_mov_b32 s12, s10 ; SI-NEXT: s_mov_b32 s13, s11 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_sub_i32_e64 v0, s[0:1], v1, v2 +; SI-NEXT: v_sub_i32_e64 v0, s[2:3], v1, v2 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_cbranch_vccnz .LBB9_2 ; SI-NEXT: ; %bb.1: ; %if -; SI-NEXT: s_xor_b64 s[8:9], s[0:1], -1 +; SI-NEXT: s_xor_b64 s[4:5], s[2:3], -1 ; SI-NEXT: .LBB9_2: ; %exit -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[8:9] -; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_byte v1, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_usubo_clamp_bit: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 -; VI-NEXT: s_mov_b64 s[2:3], 0 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: v_mov_b32_e32 v2, s10 -; VI-NEXT: v_mov_b32_e32 v3, s11 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: flat_load_dword v1, v[0:1] ; VI-NEXT: flat_load_dword v2, v[2:3] +; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; VI-NEXT: v_sub_u32_e64 v0, s[0:1], v1, v2 +; VI-NEXT: v_sub_u32_e64 v0, s[2:3], v1, 
v2 ; VI-NEXT: s_cbranch_vccnz .LBB9_2 ; VI-NEXT: ; %bb.1: ; %if -; VI-NEXT: s_xor_b64 s[2:3], s[0:1], -1 +; VI-NEXT: s_xor_b64 s[4:5], s[2:3], -1 ; VI-NEXT: .LBB9_2: ; %exit -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_mov_b32_e32 v3, s6 -; VI-NEXT: v_mov_b32_e32 v4, s7 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_mov_b32_e32 v3, s2 +; VI-NEXT: v_mov_b32_e32 v4, s3 ; VI-NEXT: flat_store_dword v[1:2], v0 -; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] -; VI-NEXT: flat_store_byte v[3:4], v0 +; VI-NEXT: flat_store_byte v[3:4], v5 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_usubo_clamp_bit: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[8:9] -; GFX9-NEXT: global_load_dword v3, v0, s[10:11] +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] +; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX9-NEXT: v_sub_co_u32_e64 v1, s[0:1], v2, v3 +; GFX9-NEXT: v_sub_co_u32_e64 v1, s[2:3], v2, v3 ; GFX9-NEXT: s_cbranch_vccnz .LBB9_2 ; GFX9-NEXT: ; %bb.1: ; %if -; GFX9-NEXT: s_xor_b64 s[2:3], s[0:1], -1 +; GFX9-NEXT: s_xor_b64 s[4:5], s[2:3], -1 ; GFX9-NEXT: .LBB9_2: ; %exit -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] -; GFX9-NEXT: global_store_byte v0, v1, s[6:7] +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_byte v0, v2, s[2:3] ; GFX9-NEXT: s_endpgm entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll index 66c49ba8b734d..e24c407123157 100644 --- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll +++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll @@ -4237,157 +4237,157 @@ define <6 x bfloat> @shuffle_v6bf16_452367(ptr addrspace(1) %arg0, ptr addrspace define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonly %A, ptr addrspace(1) nocapture readonly %B, ptr addrspace(1) nocapture %C) { ; GFX9-LABEL: fma_shuffle_v2bf16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: s_movk_i32 s2, 0x7fff ; GFX9-NEXT: s_mov_b32 s3, 0x7060302 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[1:2], v0, s[0:1] -; GFX9-NEXT: global_load_dwordx2 v[3:4], v0, s[8:9] +; GFX9-NEXT: global_load_dwordx2 v[1:2], v0, s[8:9] +; GFX9-NEXT: global_load_dwordx2 v[3:4], v0, s[0:1] ; GFX9-NEXT: global_load_dwordx2 v[5:6], v0, s[10:11] ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v1 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v3 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 
16, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v4 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_fma_f32 v7, v8, v9, v7 -; GFX9-NEXT: v_fma_f32 v1, v8, v5, v1 -; GFX9-NEXT: v_fma_f32 v2, v12, v5, v2 -; GFX9-NEXT: v_bfe_u32 v5, v7, 16, 1 -; GFX9-NEXT: v_fma_f32 v8, v12, v9, v11 -; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v7 -; GFX9-NEXT: v_bfe_u32 v11, v1, 16, 1 -; GFX9-NEXT: v_add3_u32 v5, v5, v7, s2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; GFX9-NEXT: v_or_b32_e32 v12, 0x400000, v1 -; GFX9-NEXT: v_bfe_u32 v13, v8, 16, 1 -; GFX9-NEXT: v_add3_u32 v11, v11, v1, s2 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_or_b32_e32 v14, 0x400000, v8 -; GFX9-NEXT: v_bfe_u32 v15, v2, 16, 1 -; GFX9-NEXT: v_add3_u32 v13, v13, v8, s2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v11, v12, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v4 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX9-NEXT: v_fma_f32 v8, v7, v9, v8 +; GFX9-NEXT: v_fma_f32 v3, v7, v5, v3 +; GFX9-NEXT: v_fma_f32 v4, v11, v5, v4 +; GFX9-NEXT: v_bfe_u32 v5, v8, 16, 1 +; GFX9-NEXT: v_fma_f32 v7, v11, v9, v12 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v8 +; GFX9-NEXT: v_bfe_u32 v11, v3, 16, 1 +; GFX9-NEXT: v_add3_u32 v5, v5, v8, s2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_or_b32_e32 v12, 0x400000, v3 +; GFX9-NEXT: v_bfe_u32 v13, v7, 16, 1 +; GFX9-NEXT: v_add3_u32 v11, v11, v3, s2 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_or_b32_e32 v14, 0x400000, v7 +; GFX9-NEXT: v_bfe_u32 v15, v4, 16, 1 +; GFX9-NEXT: v_add3_u32 v13, v13, v7, s2 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v12, vcc +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v6 -; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v2 -; GFX9-NEXT: v_add3_u32 v15, v15, v2, s2 +; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v4 +; GFX9-NEXT: v_add3_u32 v15, v15, v4, s2 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v13, v14, vcc -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v15, v16, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v15, v16, vcc ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX9-NEXT: v_fma_f32 v1, v3, v10, v1 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX9-NEXT: v_fma_f32 v3, v1, v10, v3 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX9-NEXT: v_fma_f32 v3, v3, v6, v5 -; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1 -; GFX9-NEXT: v_fma_f32 v2, v4, v10, v2 -; GFX9-NEXT: v_fma_f32 v4, v4, v6, v7 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX9-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX9-NEXT: v_add3_u32 v5, v5, v1, s2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX9-NEXT: v_bfe_u32 v9, v2, 16, 1 -; GFX9-NEXT: v_add3_u32 v7, v7, v3, s2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc +; GFX9-NEXT: v_fma_f32 v1, v1, v6, v5 +; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX9-NEXT: v_fma_f32 v4, v2, v10, v4 +; GFX9-NEXT: v_fma_f32 v2, v2, v6, v7 
+; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX9-NEXT: v_bfe_u32 v7, v1, 16, 1 +; GFX9-NEXT: v_add3_u32 v5, v5, v3, s2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v2 -; GFX9-NEXT: v_bfe_u32 v11, v4, 16, 1 -; GFX9-NEXT: v_add3_u32 v9, v9, v2, s2 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_or_b32_e32 v12, 0x400000, v4 -; GFX9-NEXT: v_add3_u32 v11, v11, v4, s2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v10, vcc +; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v1 +; GFX9-NEXT: v_bfe_u32 v9, v4, 16, 1 +; GFX9-NEXT: v_add3_u32 v7, v7, v1, s2 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v4 +; GFX9-NEXT: v_bfe_u32 v11, v2, 16, 1 +; GFX9-NEXT: v_add3_u32 v9, v9, v4, s2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v11, v12, vcc -; GFX9-NEXT: v_perm_b32 v2, v4, v2, s3 -; GFX9-NEXT: v_perm_b32 v1, v3, v1, s3 +; GFX9-NEXT: v_or_b32_e32 v12, 0x400000, v2 +; GFX9-NEXT: v_add3_u32 v11, v11, v2, s2 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v9, v10, vcc +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v11, v12, vcc +; GFX9-NEXT: v_perm_b32 v2, v2, v4, s3 +; GFX9-NEXT: v_perm_b32 v1, v1, v3, s3 ; GFX9-NEXT: global_store_dwordx2 v0, v[1:2], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: fma_shuffle_v2bf16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 ; GFX10-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v6, s[0:1] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v6, s[8:9] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v6, s[8:9] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] ; GFX10-NEXT: global_load_dwordx2 v[4:5], v6, s[10:11] ; GFX10-NEXT: s_waitcnt vmcnt(2) -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v0 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_fmac_f32_e32 v7, v8, v9 -; GFX10-NEXT: v_fmac_f32_e32 v0, v8, v4 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX10-NEXT: v_fmac_f32_e32 v11, v12, v9 -; GFX10-NEXT: v_fmac_f32_e32 v1, v12, v4 -; GFX10-NEXT: v_bfe_u32 v4, v7, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v7 -; GFX10-NEXT: v_bfe_u32 v9, v0, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v0 -; GFX10-NEXT: v_add3_u32 v4, v4, v7, 0x7fff -; GFX10-NEXT: v_bfe_u32 v15, v1, 16, 1 -; GFX10-NEXT: v_add3_u32 v9, v9, v0, 0x7fff -; GFX10-NEXT: v_bfe_u32 v13, v11, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v16, 0x400000, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX10-NEXT: v_add3_u32 v15, v15, v1, 0x7fff +; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v3 
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX10-NEXT: v_fmac_f32_e32 v8, v7, v9 +; GFX10-NEXT: v_fmac_f32_e32 v2, v7, v4 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_fmac_f32_e32 v12, v11, v9 +; GFX10-NEXT: v_fmac_f32_e32 v3, v11, v4 +; GFX10-NEXT: v_bfe_u32 v4, v8, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v8 +; GFX10-NEXT: v_bfe_u32 v9, v2, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v2 +; GFX10-NEXT: v_add3_u32 v4, v4, v8, 0x7fff +; GFX10-NEXT: v_bfe_u32 v15, v3, 16, 1 +; GFX10-NEXT: v_add3_u32 v9, v9, v2, 0x7fff +; GFX10-NEXT: v_bfe_u32 v13, v12, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v16, 0x400000, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX10-NEXT: v_add3_u32 v15, v15, v3, 0x7fff ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v11 -; GFX10-NEXT: v_add3_u32 v13, v13, v11, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v0, v9, v12, vcc_lo -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v12 +; GFX10-NEXT: v_add3_u32 v13, v13, v12, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v2, v9, v11, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v15, v16, vcc_lo -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 -; GFX10-NEXT: v_fmac_f32_e32 v4, v2, v5 -; GFX10-NEXT: v_fmac_f32_e32 v0, v2, v10 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v15, v16, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX10-NEXT: v_fmac_f32_e32 v4, v0, v5 +; GFX10-NEXT: v_fmac_f32_e32 v2, v0, v10 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v7, v13, v14, vcc_lo ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX10-NEXT: v_fmac_f32_e32 v1, v3, v10 +; GFX10-NEXT: v_bfe_u32 v0, v2, 16, 1 +; GFX10-NEXT: v_fmac_f32_e32 v3, v1, v10 ; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff -; GFX10-NEXT: v_bfe_u32 v9, v1, 16, 1 -; GFX10-NEXT: v_fmac_f32_e32 v7, v3, v5 -; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX10-NEXT: v_add3_u32 v0, v0, v2, 0x7fff +; GFX10-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX10-NEXT: v_fmac_f32_e32 v7, v1, v5 +; GFX10-NEXT: v_or_b32_e32 v1, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v3 ; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX10-NEXT: v_add3_u32 v9, v9, v1, 0x7fff +; GFX10-NEXT: v_add3_u32 v9, v9, v3, 0x7fff ; GFX10-NEXT: v_bfe_u32 v11, v7, 16, 1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v7 ; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff ; GFX10-NEXT: v_add3_u32 v11, v11, v7, 0x7fff @@ -4404,85 +4404,82 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl ; GFX11-LABEL: fma_shuffle_v2bf16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; 
GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x10 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: global_load_b64 v[0:1], v6, s[2:3] -; GFX11-NEXT: global_load_b64 v[2:3], v6, s[4:5] +; GFX11-NEXT: global_load_b64 v[0:1], v6, s[4:5] +; GFX11-NEXT: global_load_b64 v[2:3], v6, s[0:1] ; GFX11-NEXT: global_load_b64 v[4:5], v6, s[6:7] +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v1 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v3 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_dual_fmac_f32 v11, v12, v9 :: v_dual_and_b32 v2, 0xffff0000, v2 -; GFX11-NEXT: v_fmac_f32_e32 v1, v12, v4 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 +; GFX11-NEXT: v_dual_fmac_f32 v12, v11, v9 :: v_dual_lshlrev_b32 v3, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v13, v12, 16, 1 +; GFX11-NEXT: v_fmac_f32_e32 v3, v11, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_bfe_u32 v13, v11, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v11 -; GFX11-NEXT: v_bfe_u32 v15, v1, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_fmac_f32_e32 v7, v8, v9 -; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v1 -; GFX11-NEXT: v_add3_u32 v13, v13, v11, 0x7fff -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_add3_u32 v15, v15, v1, 0x7fff -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_fmac_f32_e32 v0, v8, v4 -; GFX11-NEXT: v_bfe_u32 v4, v7, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v7 -; GFX11-NEXT: v_bfe_u32 v9, v0, 16, 1 +; GFX11-NEXT: v_fmac_f32_e32 v8, v7, v9 +; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v12 +; GFX11-NEXT: v_add3_u32 v13, v13, v12, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v15, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_add3_u32 v15, v15, v3, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_fmac_f32_e32 v2, v7, v4 +; GFX11-NEXT: v_bfe_u32 v4, v8, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v8 +; GFX11-NEXT: v_bfe_u32 v9, v2, 16, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add3_u32 v4, v4, v7, 0x7fff -; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v0 -; GFX11-NEXT: v_add3_u32 v9, v9, v0, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_add3_u32 v4, v4, v8, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v2 +; GFX11-NEXT: v_add3_u32 v9, v9, v2, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v9, v12, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-NEXT: v_dual_fmac_f32 v4, v2, v5 :: v_dual_cndmask_b32 v1, v15, v16 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v9, v11, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_cndmask_b32 v3, v15, v16 :: v_dual_and_b32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 ; GFX11-NEXT: v_cndmask_b32_e32 v7, v13, v14, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_fmac_f32_e32 v1, v3, v10 ; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfe_u32 v9, v1, 16, 1 -; GFX11-NEXT: v_fmac_f32_e32 v7, v3, v5 -; GFX11-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add3_u32 v9, v9, v1, 0x7fff -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_fmac_f32 v7, v1, v5 :: v_dual_and_b32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_dual_fmac_f32 v4, v0, v5 :: v_dual_and_b32 v3, 0xffff0000, v3 +; GFX11-NEXT: v_fmac_f32_e32 v2, v0, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_bfe_u32 v11, v7, 16, 1 +; GFX11-NEXT: v_fmac_f32_e32 v3, v1, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX11-NEXT: v_bfe_u32 v0, v2, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v1, 0x400000, v2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v3 +; GFX11-NEXT: v_add3_u32 v0, v0, v2, 0x7fff ; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v7 -; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_fmac_f32_e32 v0, v2, v10 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v1 ; GFX11-NEXT: v_add3_u32 v11, v11, v7, 0x7fff -; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v2, v2, v0, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_add3_u32 v9, v9, v3, 0x7fff +; GFX11-NEXT: 
v_or_b32_e32 v8, 0x400000, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff ; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc_lo ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v11, v12, vcc_lo @@ -4491,7 +4488,7 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl ; GFX11-NEXT: v_perm_b32 v1, v2, v1, 0x7060302 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v8, vcc_lo ; GFX11-NEXT: v_perm_b32 v0, v3, v0, 0x7060302 -; GFX11-NEXT: global_store_b64 v6, v[0:1], s[2:3] +; GFX11-NEXT: global_store_b64 v6, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll index f78b408d78255..ae52a7ac515f3 100644 --- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll +++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll @@ -5,7 +5,6 @@ define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX906-LABEL: v3i8_liveout: ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX906-NEXT: v_lshlrev_b32_e32 v5, 2, v0 ; GFX906-NEXT: v_mov_b32_e32 v1, 0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 @@ -14,7 +13,7 @@ define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX906-NEXT: v_lshrrev_b32_e32 v4, 8, v2 -; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX906-NEXT: s_cbranch_execz .LBB0_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 ; GFX906-NEXT: global_load_dword v2, v5, s[6:7] @@ -22,11 +21,13 @@ define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX906-NEXT: v_lshrrev_b32_e32 v4, 8, v2 ; GFX906-NEXT: .LBB0_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX906-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v4 ; GFX906-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: global_store_byte v1, v3, s[2:3] offset:2 -; GFX906-NEXT: global_store_short v1, v0, s[2:3] +; GFX906-NEXT: s_waitcnt lgkmcnt(0) +; GFX906-NEXT: global_store_byte v1, v3, s[0:1] offset:2 +; GFX906-NEXT: global_store_short v1, v0, s[0:1] ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -49,7 +50,6 @@ define amdgpu_kernel void @v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX906-LABEL: v4i8_liveout: ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX906-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; GFX906-NEXT: v_mov_b32_e32 v1, 0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 @@ -59,7 +59,7 @@ define amdgpu_kernel void @v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v2 ; GFX906-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX906-NEXT: v_lshrrev_b32_e32 v5, 8, v2 -; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX906-NEXT: s_cbranch_execz .LBB1_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 ; GFX906-NEXT: global_load_dword v2, v6, s[6:7] @@ -68,13 +68,15 @@ define amdgpu_kernel void 
@v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX906-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX906-NEXT: v_lshrrev_b32_e32 v5, 8, v2 ; GFX906-NEXT: .LBB1_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX906-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v5 ; GFX906-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v3 ; GFX906-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: global_store_dword v1, v0, s[2:3] +; GFX906-NEXT: s_waitcnt lgkmcnt(0) +; GFX906-NEXT: global_store_dword v1, v0, s[0:1] ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -97,7 +99,6 @@ define amdgpu_kernel void @v5i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX906-LABEL: v5i8_liveout: ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX906-NEXT: v_lshlrev_b32_e32 v7, 3, v0 ; GFX906-NEXT: v_mov_b32_e32 v5, 0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 @@ -107,7 +108,7 @@ define amdgpu_kernel void @v5i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX906-NEXT: v_lshrrev_b64 v[3:4], 24, v[1:2] ; GFX906-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX906-NEXT: v_lshrrev_b32_e32 v6, 8, v1 -; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX906-NEXT: s_cbranch_execz .LBB2_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 ; GFX906-NEXT: global_load_dwordx2 v[1:2], v7, s[6:7] @@ -116,14 +117,16 @@ define amdgpu_kernel void @v5i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX906-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX906-NEXT: v_lshrrev_b32_e32 v6, 8, v1 ; GFX906-NEXT: .LBB2_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX906-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v6 +; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3 ; GFX906-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v3 -; GFX906-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v1, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: global_store_byte v5, v2, s[2:3] offset:4 -; GFX906-NEXT: global_store_dword v5, v0, s[2:3] +; GFX906-NEXT: s_waitcnt lgkmcnt(0) +; GFX906-NEXT: global_store_byte v5, v2, s[0:1] offset:4 +; GFX906-NEXT: global_store_dword v5, v0, s[0:1] ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -146,7 +149,6 @@ define amdgpu_kernel void @v8i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX906-LABEL: v8i8_liveout: ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX906-NEXT: v_lshlrev_b32_e32 v10, 3, v0 ; GFX906-NEXT: v_mov_b32_e32 v3, 0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 @@ -159,7 +161,7 @@ define amdgpu_kernel void @v8i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX906-NEXT: v_lshrrev_b32_e32 v7, 24, 
v1 ; GFX906-NEXT: v_lshrrev_b32_e32 v8, 16, v1 ; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v1 -; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX906-NEXT: s_cbranch_execz .LBB3_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 ; GFX906-NEXT: global_load_dwordx2 v[1:2], v10, s[6:7] @@ -171,10 +173,11 @@ define amdgpu_kernel void @v8i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX906-NEXT: v_lshrrev_b32_e32 v8, 16, v1 ; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v1 ; GFX906-NEXT: .LBB3_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v9 ; GFX906-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v7 +; GFX906-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX906-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v6 @@ -182,7 +185,8 @@ define amdgpu_kernel void @v8i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v4 ; GFX906-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: global_store_dwordx2 v3, v[0:1], s[2:3] +; GFX906-NEXT: s_waitcnt lgkmcnt(0) +; GFX906-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -205,7 +209,6 @@ define amdgpu_kernel void @v16i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1 ; GFX906-LABEL: v16i8_liveout: ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX906-NEXT: v_lshlrev_b32_e32 v18, 4, v0 ; GFX906-NEXT: v_mov_b32_e32 v5, 0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 @@ -224,7 +227,7 @@ define amdgpu_kernel void @v16i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1 ; GFX906-NEXT: v_lshrrev_b32_e32 v15, 24, v1 ; GFX906-NEXT: v_lshrrev_b32_e32 v16, 16, v1 ; GFX906-NEXT: v_lshrrev_b32_e32 v17, 8, v1 -; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX906-NEXT: s_cbranch_execz .LBB4_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 ; GFX906-NEXT: global_load_dwordx4 v[1:4], v18, s[6:7] @@ -242,7 +245,7 @@ define amdgpu_kernel void @v16i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1 ; GFX906-NEXT: v_lshrrev_b32_e32 v16, 16, v1 ; GFX906-NEXT: v_lshrrev_b32_e32 v17, 8, v1 ; GFX906-NEXT: .LBB4_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v17 ; GFX906-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v15 @@ -256,6 +259,7 @@ define amdgpu_kernel void @v16i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1 ; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v11 ; GFX906-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v9 +; GFX906-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX906-NEXT: v_or_b32_sdwa v3, v10, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v2, v2, v3 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v8 @@ -263,7 +267,8 @@ define amdgpu_kernel void @v16i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1 ; GFX906-NEXT: v_lshlrev_b16_e32 v4, 8, v6 ; GFX906-NEXT: v_or_b32_sdwa v4, v7, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: global_store_dwordx4 v5, v[0:3], s[2:3] +; GFX906-NEXT: s_waitcnt lgkmcnt(0) +; GFX906-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -286,13 +291,12 @@ define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1 ; GFX906-LABEL: v32i8_liveout: ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX906-NEXT: v_lshlrev_b32_e32 v31, 5, v0 -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX906-NEXT: v_lshlrev_b32_e32 v33, 5, v0 ; GFX906-NEXT: v_mov_b32_e32 v9, 0 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dwordx4 v[1:4], v31, s[4:5] offset:16 -; GFX906-NEXT: global_load_dwordx4 v[5:8], v31, s[4:5] +; GFX906-NEXT: global_load_dwordx4 v[1:4], v33, s[4:5] offset:16 +; GFX906-NEXT: global_load_dwordx4 v[5:8], v33, s[4:5] ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v4 ; GFX906-NEXT: v_lshrrev_b32_e32 v10, 16, v4 @@ -317,13 +321,13 @@ define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1 ; GFX906-NEXT: v_lshrrev_b32_e32 v28, 16, v6 ; GFX906-NEXT: v_lshrrev_b32_e32 v29, 8, v6 ; GFX906-NEXT: v_lshrrev_b32_e32 v30, 24, v5 -; GFX906-NEXT: v_lshrrev_b32_e32 v32, 16, v5 -; GFX906-NEXT: v_lshrrev_b32_e32 v33, 8, v5 +; GFX906-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; GFX906-NEXT: v_lshrrev_b32_e32 v32, 8, v5 ; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX906-NEXT: s_cbranch_execz .LBB5_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 -; GFX906-NEXT: global_load_dwordx4 v[1:4], v31, s[6:7] offset:16 -; GFX906-NEXT: global_load_dwordx4 v[5:8], v31, s[6:7] +; GFX906-NEXT: global_load_dwordx4 v[1:4], v33, s[6:7] offset:16 +; GFX906-NEXT: global_load_dwordx4 v[5:8], v33, s[6:7] ; GFX906-NEXT: s_waitcnt vmcnt(1) ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v4 ; GFX906-NEXT: v_lshrrev_b32_e32 v10, 16, v4 @@ -348,19 +352,20 @@ define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1 ; GFX906-NEXT: v_lshrrev_b32_e32 v28, 16, v6 ; GFX906-NEXT: v_lshrrev_b32_e32 v29, 8, v6 ; GFX906-NEXT: v_lshrrev_b32_e32 v30, 24, v5 -; GFX906-NEXT: v_lshrrev_b32_e32 v32, 16, v5 -; GFX906-NEXT: v_lshrrev_b32_e32 v33, 8, v5 +; GFX906-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; GFX906-NEXT: v_lshrrev_b32_e32 v32, 8, v5 ; GFX906-NEXT: .LBB5_2: ; %bb.2 ; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX906-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX906-NEXT: v_lshlrev_b16_e32 v30, 8, v30 -; GFX906-NEXT: v_lshlrev_b16_e32 v31, 8, v33 +; GFX906-NEXT: v_or_b32_sdwa v30, v31, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_lshlrev_b16_e32 v31, 8, v32 ; GFX906-NEXT: v_lshlrev_b16_e32 v29, 8, v29 ; GFX906-NEXT: v_lshlrev_b16_e32 v27, 8, v27 ; GFX906-NEXT: v_lshlrev_b16_e32 v26, 8, v26 ; GFX906-NEXT: v_lshlrev_b16_e32 v24, 8, v24 ; GFX906-NEXT: v_lshlrev_b16_e32 v23, 8, v23 ; GFX906-NEXT: v_lshlrev_b16_e32 v21, 8, v21 -; GFX906-NEXT: v_or_b32_sdwa v30, v32, v30 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT: v_or_b32_sdwa v5, v5, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT: v_or_b32_sdwa v6, v6, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT: v_or_b32_sdwa v27, v28, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -372,6 +377,7 @@ define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1
 ; GFX906-NEXT: v_or_b32_sdwa v6, v6, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX906-NEXT: v_or_b32_sdwa v7, v7, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX906-NEXT: v_or_b32_sdwa v8, v8, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT: global_store_dwordx4 v9, v[5:8], s[0:1]
 ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
 ; GFX906-NEXT: v_lshlrev_b16_e32 v5, 8, v20
@@ -415,20 +421,19 @@ bb.2:
 define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
 ; GFX906-LABEL: v256i8_liveout:
 ; GFX906: ; %bb.0: ; %entry
+; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX906-NEXT: v_lshlrev_b32_e32 v63, 3, v0
 ; GFX906-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
 ; GFX906-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
 ; GFX906-NEXT: s_mov_b32 s10, -1
-; GFX906-NEXT: s_mov_b32 s11, 0xe00000
-; GFX906-NEXT: s_add_u32 s8, s8, s3
-; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX906-NEXT: v_lshlrev_b32_e32 v63, 3, v0
-; GFX906-NEXT: s_addc_u32 s9, s9, 0
 ; GFX906-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT: global_load_dwordx4 v[17:20], v63, s[4:5] offset:240
 ; GFX906-NEXT: global_load_dwordx4 v[5:8], v63, s[4:5] offset:224
 ; GFX906-NEXT: global_load_dwordx4 v[9:12], v63, s[4:5] offset:208
 ; GFX906-NEXT: global_load_dwordx4 v[13:16], v63, s[4:5] offset:192
+; GFX906-NEXT: s_mov_b32 s11, 0xe00000
+; GFX906-NEXT: s_add_u32 s8, s8, s3
+; GFX906-NEXT: s_addc_u32 s9, s9, 0
 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
 ; GFX906-NEXT: v_mov_b32_e32 v4, 0
 ; GFX906-NEXT: s_waitcnt vmcnt(3)
@@ -844,7 +849,7 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
 ; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v1
 ; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:772 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v62, 8, v0
-; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc
 ; GFX906-NEXT: s_cbranch_execz .LBB6_2
 ; GFX906-NEXT: ; %bb.1: ; %bb.1
 ; GFX906-NEXT: global_load_dwordx4 v[0:3], v63, s[6:7] offset:240
@@ -1265,7 +1270,7 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
 ; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:772 ; 4-byte Folded Spill
 ; GFX906-NEXT: v_lshrrev_b32_e32 v62, 8, v0
 ; GFX906-NEXT: .LBB6_2: ; %bb.2
-; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
 ; GFX906-NEXT: v_lshlrev_b16_e32 v61, 8, v61
 ; GFX906-NEXT: v_or_b32_sdwa v1, v1, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT: buffer_load_dword v61, off, s[8:11], 0 offset:764 ; 4-byte Folded Reload
@@ -1273,6 +1278,7 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
 ; GFX906-NEXT: v_or_b32_sdwa v0, v0, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT: buffer_load_dword v62, off, s[8:11], 0 offset:772 ; 4-byte Folded Reload
 ; GFX906-NEXT: buffer_load_dword v63, off, s[8:11], 0 offset:760 ; 4-byte Folded Reload
+; GFX906-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX906-NEXT: s_waitcnt vmcnt(2)
 ; GFX906-NEXT: v_lshlrev_b16_e32 v61, 8, v61
 ; GFX906-NEXT: v_or_b32_sdwa v2, v2, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -1304,7 +1310,8 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
 ; GFX906-NEXT: s_waitcnt vmcnt(0)
 ; GFX906-NEXT: v_or_b32_sdwa v61, v62, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT: v_or_b32_sdwa v3, v3, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
+; GFX906-NEXT: s_waitcnt lgkmcnt(0)
+; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:732 ; 4-byte Folded Reload
 ; GFX906-NEXT: s_nop 0
 ; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:728 ; 4-byte Folded Reload
@@ -1348,7 +1355,7 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
 ; GFX906-NEXT: s_waitcnt vmcnt(0)
 ; GFX906-NEXT: v_or_b32_sdwa v57, v58, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT: v_or_b32_sdwa v3, v3, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16
+; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
 ; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:684 ; 4-byte Folded Reload
 ; GFX906-NEXT: s_nop 0
 ; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:680 ; 4-byte Folded Reload
@@ -1392,7 +1399,7 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
 ; GFX906-NEXT: s_waitcnt vmcnt(0)
 ; GFX906-NEXT: v_or_b32_sdwa v53, v54, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT: v_or_b32_sdwa v3, v3, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:32
+; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
 ; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:636 ; 4-byte Folded Reload
 ; GFX906-NEXT: s_nop 0
 ; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:632 ; 4-byte Folded Reload
@@ -1436,7 +1443,7 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
 ; GFX906-NEXT: s_waitcnt vmcnt(0)
 ; GFX906-NEXT: v_or_b32_sdwa v49, v50, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT: v_or_b32_sdwa v3, v3, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:48
+; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
 ; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:588 ; 4-byte Folded Reload
 ; GFX906-NEXT: s_nop 0
 ; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:584 ; 4-byte Folded Reload
@@ -1480,7 +1487,7 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
 ; GFX906-NEXT: s_waitcnt vmcnt(0)
 ; GFX906-NEXT: v_or_b32_sdwa v45, v46, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT: v_or_b32_sdwa v3, v3, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:64
+; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64
 ; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:540 ; 4-byte Folded Reload
 ; GFX906-NEXT: s_nop 0
 ; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:536 ; 4-byte Folded Reload
@@ -1524,7 +1531,7 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
 ; GFX906-NEXT: s_waitcnt vmcnt(0)
 ; GFX906-NEXT: v_or_b32_sdwa v41, v42, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT: v_or_b32_sdwa v3, v3, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:80
+; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80
 ; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:492 ; 4-byte Folded Reload
 ; GFX906-NEXT: s_nop 0
 ; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:488 ; 4-byte Folded Reload
@@ -1568,7 +1575,7 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
 ; GFX906-NEXT: s_waitcnt vmcnt(0)
 ; GFX906-NEXT: v_or_b32_sdwa v37, v38, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT: v_or_b32_sdwa v3, v3, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:96
+; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96
 ; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:444 ; 4-byte Folded Reload
 ; GFX906-NEXT: s_nop 0
 ; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:440 ; 4-byte Folded Reload
@@ -1612,7 +1619,7 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
 ; GFX906-NEXT: s_waitcnt vmcnt(0)
 ; GFX906-NEXT: v_or_b32_sdwa v33, v34, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT: v_or_b32_sdwa v3, v3, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:112
+; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112
 ; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:396 ; 4-byte Folded Reload
 ; GFX906-NEXT: s_nop 0
 ; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:392 ; 4-byte Folded Reload
@@ -1656,7 +1663,7 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
 ; GFX906-NEXT: s_waitcnt vmcnt(0)
 ; GFX906-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT: v_or_b32_sdwa v3, v3, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:128
+; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:128
 ; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:348 ; 4-byte Folded Reload
 ; GFX906-NEXT: s_nop 0
 ; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:344 ; 4-byte Folded Reload
@@ -1700,7 +1707,7 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
 ; GFX906-NEXT: s_waitcnt vmcnt(0)
 ; GFX906-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT: v_or_b32_sdwa v3, v3, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:144
+; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:144
 ; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:300 ; 4-byte Folded Reload
 ; GFX906-NEXT: s_nop 0
 ; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:296 ; 4-byte Folded Reload
@@ -1744,7 +1751,7 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
 ; GFX906-NEXT: s_waitcnt vmcnt(0)
 ; GFX906-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT: v_or_b32_sdwa v3, v3, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:160
+; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:160
 ; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:252 ; 4-byte Folded Reload
 ; GFX906-NEXT: s_nop 0
 ; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:248 ; 4-byte Folded Reload
@@ -1788,7 +1795,7 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
 ; GFX906-NEXT: s_waitcnt vmcnt(0)
 ; GFX906-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT: v_or_b32_sdwa v3, v3, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:176
+; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:176
 ; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:200 ; 4-byte Folded Reload
 ; GFX906-NEXT: s_nop 0
 ; GFX906-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:204 ; 4-byte Folded Reload
@@ -1831,7 +1838,7 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
 ; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
 ; GFX906-NEXT: v_or_b32_sdwa v3, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT: v_or_b32_sdwa v3, v3, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:192
+; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:192
 ; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:156 ; 4-byte Folded Reload
 ; GFX906-NEXT: s_nop 0
 ; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:152 ; 4-byte Folded Reload
@@ -1873,7 +1880,7 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
 ; GFX906-NEXT: v_or_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:208
+; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:208
 ; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:108 ; 4-byte Folded Reload
 ; GFX906-NEXT: s_nop 0
 ; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:104 ; 4-byte Folded Reload
@@ -1915,7 +1922,7 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
 ; GFX906-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:224
+; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:224
 ; GFX906-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:60 ; 4-byte Folded Reload
 ; GFX906-NEXT: s_nop 0
 ; GFX906-NEXT: buffer_load_dword v5, off, s[8:11], 0 ; 4-byte Folded Reload
@@ -1963,7 +1970,7 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
 ; GFX906-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX906-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:240
+; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:240
 ; GFX906-NEXT: s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
index 901e88a4c6aca..a9ff1b9ccbbc4 100644
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -1261,26 +1261,25 @@ define amdgpu_kernel void @test_div_fmas_f64(ptr addrspace(1) %out, double %a, d
 define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr addrspace(1) %dummy) #0 {
 ; GFX1032-LABEL: test_div_fmas_f32_i1_phi_vcc:
 ; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_clause 0x1
 ; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 2, v0
-; GFX1032-NEXT: s_mov_b32 null, 0
-; GFX1032-NEXT: v_cmp_eq_u32_e64 s0, 0, v0
+; GFX1032-NEXT: v_cmp_eq_u32_e64 s2, 0, v0
 ; GFX1032-NEXT: s_mov_b32 vcc_lo, 0
 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT: global_load_dwordx3 v[1:3], v1, s[6:7]
-; GFX1032-NEXT: s_and_saveexec_b32 s1, s0
+; GFX1032-NEXT: s_and_saveexec_b32 s3, s2
 ; GFX1032-NEXT: s_cbranch_execz .LBB22_2
 ; GFX1032-NEXT: ; %bb.1: ; %bb
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0
-; GFX1032-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: global_load_dword v0, v0, s[0:1] glc dlc
 ; GFX1032-NEXT: s_waitcnt vmcnt(0)
 ; GFX1032-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT: s_and_b32 vcc_lo, vcc_lo, exec_lo
 ; GFX1032-NEXT: .LBB22_2: ; %exit
 ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0
 ; GFX1032-NEXT: s_waitcnt vmcnt(0)
 ; GFX1032-NEXT: v_div_fmas_f32 v1, v1, v2, v3
@@ -1289,26 +1288,25 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, p
 ;
 ; GFX1064-LABEL: test_div_fmas_f32_i1_phi_vcc:
 ; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: s_clause 0x1
 ; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX1064-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 2, v0
-; GFX1064-NEXT: s_mov_b32 null, 0
-; GFX1064-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e64 s[2:3], 0, v0
 ; GFX1064-NEXT: s_mov_b64 vcc, 0
 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT: global_load_dwordx3 v[1:3], v1, s[6:7]
-; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
+; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], s[2:3]
 ; GFX1064-NEXT: s_cbranch_execz .LBB22_2
 ; GFX1064-NEXT: ; %bb.1: ; %bb
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0
-; GFX1064-NEXT: global_load_dword v0, v0, s[8:9] glc dlc
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: global_load_dword v0, v0, s[0:1] glc dlc
 ; GFX1064-NEXT: s_waitcnt vmcnt(0)
 ; GFX1064-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT: s_and_b64 vcc, vcc, exec
 ; GFX1064-NEXT: .LBB22_2: ; %exit
-; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 exec, exec, s[6:7]
 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0
 ; GFX1064-NEXT: s_waitcnt vmcnt(0)
 ; GFX1064-NEXT: v_div_fmas_f32 v1, v1, v2, v3
diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll
index ddc14b6a25c26..775b0a5a3394a 100644
--- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll
+++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll
@@ -111,8 +111,7 @@ define protected amdgpu_kernel void @InferMixed(i32 %a, ptr addrspace(1) %b, dou
 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT: s_ashr_i32 s3, s2, 31
 ; CHECK-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
-; CHECK-NEXT: v_mov_b32_e32 v0, s8
-; CHECK-NEXT: v_mov_b32_e32 v1, s9
+; CHECK-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
 ; CHECK-NEXT: s_add_u32 s0, s4, s0
 ; CHECK-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
 ; CHECK-NEXT: s_addc_u32 s1, s5, s1