diff --git a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp index 8d94b40a41bea..7e161bfb39ac1 100644 --- a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp +++ b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp @@ -1988,6 +1988,7 @@ void TwoAddressInstructionImpl::eliminateRegSequence( SmallVector OrigRegs; VNInfo *DefVN = nullptr; + bool DefEmitted = false; if (LIS) { OrigRegs.push_back(MI.getOperand(0).getReg()); for (unsigned i = 1, e = MI.getNumOperands(); i < e; i += 2) @@ -1998,9 +1999,17 @@ void TwoAddressInstructionImpl::eliminateRegSequence( .valueOut(); } } - + for (unsigned i = 1, e = MI.getNumOperands(); i < e; i += 2) + if (MI.getOperand(i).isReg() && MI.getOperand(i).isUndef()) { + // Insert the IMPLICIT_DEF on dst register. + MachineInstr *DefMI = + BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), + TII->get(TargetOpcode::IMPLICIT_DEF), DstReg); + MBBI = DefMI; + DefEmitted = true; + break; + } LaneBitmask UndefLanes = LaneBitmask::getNone(); - bool DefEmitted = false; for (unsigned i = 1, e = MI.getNumOperands(); i < e; i += 2) { MachineOperand &UseMO = MI.getOperand(i); Register SrcReg = UseMO.getReg(); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll index 1cd9c0bfeb7e6..84247841691ab 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll @@ -8,37 +8,40 @@ define amdgpu_kernel void @v_mul_i64_no_zext(ptr addrspace(1) %out, ptr addrspac ; GFX10-LABEL: v_mul_i64_no_zext: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 3, v0 +; GFX10-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v7, s[0:1] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v7, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v8, s[0:1] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v8, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mad_u64_u32 v[4:5], s0, v0, v2, 0 -; GFX10-NEXT: v_mad_u64_u32 v[5:6], s0, v0, v3, v[5:6] +; GFX10-NEXT: v_mov_b32_e32 v6, v5 +; GFX10-NEXT: v_mad_u64_u32 v[5:6], s0, v0, v3, v[6:7] ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, v1, v2, v[5:6] ; GFX10-NEXT: v_mov_b32_e32 v5, v0 -; GFX10-NEXT: global_store_dwordx2 v7, v[4:5], s[2:3] +; GFX10-NEXT: global_store_dwordx2 v8, v[4:5], s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_mul_i64_no_zext: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v9, 3, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b64 v[0:1], v9, s[0:1] -; GFX11-NEXT: global_load_b64 v[2:3], v9, s[2:3] +; GFX11-NEXT: global_load_b64 v[0:1], v10, s[0:1] +; GFX11-NEXT: global_load_b64 v[2:3], v10, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, v0, v2, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v0, v3, v[5:6] -; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v1, v2, v[6:7] +; GFX11-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-NEXT: v_mad_u64_u32 v[8:9], null, v0, v3, v[6:7] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v5, v7 -; GFX11-NEXT: global_store_b64 v9, v[4:5], s[2:3] +; GFX11-NEXT: v_mad_u64_u32 v[5:6], null, v1, v2, v[8:9] +; GFX11-NEXT: global_store_b64 v10, v[4:5], s[2:3] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid @@ -60,13 +63,14 @@ define amdgpu_kernel void @v_mul_i64_zext_src1(ptr addrspace(1) %out, ptr addrsp ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 2, v0 +; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] -; GFX10-NEXT: global_load_dword v4, v3, s[6:7] +; GFX10-NEXT: global_load_dword v6, v3, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mad_u64_u32 v[2:3], s2, v0, v4, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v1, v4, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[2:3], s2, v0, v6, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v1, v6, v[4:5] ; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1] @@ -78,18 +82,21 @@ define amdgpu_kernel void @v_mul_i64_zext_src1(ptr addrspace(1) %out, ptr addrsp ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v1, s[2:3] -; GFX11-NEXT: global_load_b32 v5, v2, s[4:5] +; GFX11-NEXT: global_load_b32 v8, v2, s[4:5] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v0, v5, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v1, v5, v[0:1] +; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v0, v8, 0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v1, v8, v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v3, v6 ; GFX11-NEXT: global_store_b64 v0, v[2:3], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -112,13 +119,14 @@ define amdgpu_kernel void @v_mul_i64_zext_src0(ptr addrspace(1) %out, ptr addrsp ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v2, s[2:3] +; GFX10-NEXT: global_load_dword v6, v2, s[2:3] ; GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mad_u64_u32 v[2:3], s2, v4, v0, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v4, v1, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[2:3], s2, v6, v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v6, v1, v[4:5] ; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1] @@ -130,18 +138,21 @@ define amdgpu_kernel void @v_mul_i64_zext_src0(ptr addrspace(1) %out, ptr addrsp ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v5, v1, s[2:3] +; GFX11-NEXT: global_load_b32 v8, v1, s[2:3] ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[4:5] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v5, v0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v5, v1, v[0:1] +; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v8, v0, 0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v8, v1, v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v3, v6 ; GFX11-NEXT: global_store_b64 v0, v[2:3], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -210,14 +221,15 @@ define amdgpu_kernel void @v_mul_i64_masked_src0_hi(ptr addrspace(1) %out, ptr a ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dword v4, v2, s[2:3] +; GFX10-NEXT: global_load_dword v6, v2, s[2:3] ; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mad_u64_u32 v[2:3], s2, v4, v0, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v4, v1, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[2:3], s2, v6, v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v6, v1, v[4:5] ; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1] @@ -229,18 +241,21 @@ define amdgpu_kernel void @v_mul_i64_masked_src0_hi(ptr addrspace(1) %out, ptr a ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b32 v5, v0, s[2:3] +; GFX11-NEXT: global_load_b32 v8, v0, s[2:3] ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[4:5] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v5, v0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v5, v1, v[0:1] +; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v8, v0, 0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v8, v1, v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v3, v6 ; GFX11-NEXT: global_store_b64 v0, v[2:3], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -390,16 +405,17 @@ define amdgpu_kernel void @v_mul_i64_partially_masked_src0(ptr addrspace(1) %out ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX10-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] ; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_and_b32_e32 v6, 0xfff00000, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xfff00000, v0 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mad_u64_u32 v[4:5], s2, v6, v2, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, v5 -; GFX10-NEXT: v_mad_u64_u32 v[5:6], s2, v6, v3, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[4:5], s2, v0, v2, 0 +; GFX10-NEXT: v_mov_b32_e32 v6, v5 +; GFX10-NEXT: v_mad_u64_u32 v[5:6], s2, v0, v3, v[6:7] ; GFX10-NEXT: v_and_b32_e32 v0, 0xf00f, v1 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v0, v2, v[5:6] ; GFX10-NEXT: v_mov_b32_e32 v5, v0 @@ -413,6 +429,7 @@ define amdgpu_kernel void @v_mul_i64_partially_masked_src0(ptr addrspace(1) %out ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -420,15 +437,15 @@ define amdgpu_kernel void @v_mul_i64_partially_masked_src0(ptr addrspace(1) %out ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] ; GFX11-NEXT: global_load_b64 v[2:3], v2, s[4:5] ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_and_b32_e32 v7, 0xfff00000, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xfff00000, v0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, v7, v2, 0 -; GFX11-NEXT: v_mov_b32_e32 v0, v5 +; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, v0, v2, 0 +; GFX11-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mad_u64_u32 v[5:6], null, v7, v3, v[0:1] +; GFX11-NEXT: v_mad_u64_u32 v[8:9], null, v0, v3, v[6:7] ; GFX11-NEXT: v_and_b32_e32 v3, 0xf00f, v1 -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v2, v[5:6] +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v2, v[8:9] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v0, 0 ; GFX11-NEXT: global_store_b64 v0, v[4:5], s[0:1] @@ -510,7 +527,9 @@ define amdgpu_kernel void @v_mul64_masked_before_and_in_branch(ptr addrspace(1) ; GFX10-NEXT: ; %bb.1: ; %else ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s3, v2, v4, 0 -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s3, v2, v5, v[1:2] +; GFX10-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s3, v2, v5, v[3:4] ; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX10-NEXT: .LBB10_2: ; %Flow @@ -547,11 +566,14 @@ define amdgpu_kernel void @v_mul64_masked_before_and_in_branch(ptr addrspace(1) ; GFX11-NEXT: ; %bb.1: ; %else ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v4, 0 +; GFX11-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v2, v5, v[1:2] -; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX11-NEXT: v_mov_b32_e32 v1, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v1 +; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v2, v5, v[3:4] ; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v1, v6 ; GFX11-NEXT: .LBB10_2: ; %Flow ; GFX11-NEXT: s_and_not1_saveexec_b32 s2, s2 ; GFX11-NEXT: s_cbranch_execz .LBB10_4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll index 637aaf7529364..b2517431f6fbf 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll @@ -548,7 +548,9 @@ define i64 @v_mul_i64(i64 %num, i64 %den) { ; GCN-NEXT: v_mov_b32_e32 v4, v0 ; GCN-NEXT: v_mov_b32_e32 v5, v1 ; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0 -; GCN-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v3, v[1:2] +; GCN-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GCN-NEXT: v_mov_b32_e32 v6, v1 +; GCN-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v3, v[6:7] ; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[3:4] ; GCN-NEXT: s_setpc_b64 s[30:31] ; @@ -557,8 +559,10 @@ define i64 @v_mul_i64(i64 %num, i64 %den) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; GFX10-NEXT: v_mov_b32_e32 v5, v1 +; GFX10-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v4, v2, 0 -; GFX10-NEXT: v_mad_u64_u32 v[3:4], s4, v4, v3, v[1:2] +; GFX10-NEXT: v_mov_b32_e32 v6, v1 +; GFX10-NEXT: v_mad_u64_u32 v[3:4], s4, v4, v3, v[6:7] ; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v5, v2, v[3:4] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -567,9 +571,11 @@ define i64 @v_mul_i64(i64 %num, i64 %den) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v2 ; GFX11-NEXT: v_mov_b32_e32 v6, v1 +; GFX11-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v4, v5, 0 -; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v4, v3, v[1:2] -; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v6, v5, v[7:8] +; GFX11-NEXT: v_mov_b32_e32 v7, v1 +; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v4, v3, v[7:8] +; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v6, v5, v[9:10] ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_mul_i64: @@ -3129,34 +3135,40 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: buffer_load_dword v4, v[2:3], s[0:3], 0 addr64 -; GFX7-NEXT: v_mov_b32_e32 v5, 0x50 +; GFX7-NEXT: buffer_load_dword v6, v[2:3], s[0:3], 0 addr64 +; GFX7-NEXT: v_mov_b32_e32 v7, 0x50 +; GFX7-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v5, 0 -; GFX7-NEXT: v_ashrrev_i32_e32 v4, 31, v4 -; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v5, v[3:4] +; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v7, 0 +; GFX7-NEXT: v_ashrrev_i32_e32 v6, 31, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, v3 +; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[4:5] ; GFX7-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: s_mul_u64_sext_with_vregs: ; GFX8: ; %bb.0: -; GFX8-NEXT: flat_load_dword v4, v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v5, 0x50 +; GFX8-NEXT: flat_load_dword v6, v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v7, 0x50 +; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v5, 0 -; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v4 -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v4, v5, v[3:4] +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v6, v7, 0 +; GFX8-NEXT: v_ashrrev_i32_e32 v6, 31, v6 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v6, v7, v[4:5] ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: s_mul_u64_sext_with_vregs: ; GFX9: ; %bb.0: -; GFX9-NEXT: global_load_dword v4, v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v5, 0x50 +; GFX9-NEXT: global_load_dword v6, v[2:3], off +; GFX9-NEXT: v_mov_b32_e32 v7, 0x50 +; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v5, 0 -; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v4 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v4, v5, v[3:4] +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v6, v7, 0 +; GFX9-NEXT: v_ashrrev_i32_e32 v6, 31, v6 +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v6, v7, v[4:5] ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX9-NEXT: s_endpgm ; @@ -3165,8 +3177,10 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr ; GFX10-NEXT: global_load_dword v4, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, 0x50, v4, 0 -; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v4 -; GFX10-NEXT: v_mad_u64_u32 v[3:4], s0, 0x50, v4, v[3:4] +; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v4 +; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_mad_u64_u32 v[3:4], s0, 0x50, v6, v[4:5] ; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX10-NEXT: s_endpgm ; @@ -3175,9 +3189,11 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr ; GFX11-NEXT: global_load_b32 v4, v[2:3], off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, 0x50, v4, 0 -; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v4 -; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, 0x50, v6, v[3:4] -; GFX11-NEXT: v_mov_b32_e32 v3, v4 +; GFX11-NEXT: v_ashrrev_i32_e32 v8, 31, v4 +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, 0x50, v8, v[4:5] +; GFX11-NEXT: v_mov_b32_e32 v3, v6 ; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX11-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll index f57fc005b994b..5275ba3fd7bcc 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll @@ -39,72 +39,74 @@ define i64 @v_sdiv_i64(i64 %num, i64 %den) { ; CHECK-NEXT: v_mul_f32_e32 v6, 0x2f800000, v3 ; CHECK-NEXT: v_trunc_f32_e32 v8, v6 ; CHECK-NEXT: v_mac_f32_e32 v3, 0xcf800000, v8 -; CHECK-NEXT: v_cvt_u32_f32_e32 v9, v3 +; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v3 ; CHECK-NEXT: v_cvt_u32_f32_e32 v12, v8 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v9, 0 -; CHECK-NEXT: v_mov_b32_e32 v3, v7 -; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[3:4] -; CHECK-NEXT: v_mul_lo_u32 v3, v12, v6 -; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8] -; CHECK-NEXT: v_mul_hi_u32 v8, v9, v6 +; CHECK-NEXT: ; implicit-def: $vgpr8_vgpr9 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v3, 0 +; CHECK-NEXT: v_mov_b32_e32 v8, v7 +; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[8:9] +; CHECK-NEXT: v_mul_hi_u32 v9, v3, v6 +; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v3, v[7:8] +; CHECK-NEXT: v_mul_lo_u32 v8, v12, v6 ; CHECK-NEXT: v_mul_hi_u32 v6, v12, v6 -; CHECK-NEXT: v_mul_lo_u32 v13, v9, v7 +; CHECK-NEXT: v_mul_lo_u32 v13, v3, v7 ; CHECK-NEXT: v_mul_lo_u32 v14, v12, v7 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v13 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v13 ; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v8 -; CHECK-NEXT: v_mul_hi_u32 v8, v9, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v13, v3 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; CHECK-NEXT: v_mul_hi_u32 v9, v3, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v13, v8 ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v14, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v9 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; CHECK-NEXT: v_mul_hi_u32 v7, v12, v7 ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v13, v8 -; CHECK-NEXT: v_mul_hi_u32 v7, v12, v7 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v3 -; CHECK-NEXT: v_addc_u32_e32 v12, vcc, v12, v6, vcc -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v9, 0 -; CHECK-NEXT: v_mov_b32_e32 v3, v7 -; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[3:4] -; CHECK-NEXT: v_ashrrev_i32_e32 v10, 31, v5 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v10 -; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8] -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v10, vcc -; CHECK-NEXT: v_xor_b32_e32 v8, v3, v10 -; CHECK-NEXT: v_mul_lo_u32 v3, v12, v6 -; CHECK-NEXT: v_mul_lo_u32 v5, v9, v7 -; CHECK-NEXT: v_xor_b32_e32 v11, v4, v10 -; CHECK-NEXT: v_mul_hi_u32 v4, v9, v6 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; CHECK-NEXT: v_addc_u32_e32 v12, vcc, v12, v7, vcc +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v3, 0 +; CHECK-NEXT: ; implicit-def: $vgpr8_vgpr9 +; CHECK-NEXT: v_mov_b32_e32 v8, v7 +; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[8:9] +; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v5 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v3, v[7:8] +; CHECK-NEXT: v_addc_u32_e32 v5, vcc, v5, v9, vcc +; CHECK-NEXT: v_xor_b32_e32 v8, v4, v9 +; CHECK-NEXT: v_mul_lo_u32 v4, v12, v6 +; CHECK-NEXT: v_mul_lo_u32 v10, v3, v7 +; CHECK-NEXT: v_xor_b32_e32 v11, v5, v9 +; CHECK-NEXT: v_mul_hi_u32 v5, v3, v6 ; CHECK-NEXT: v_mul_hi_u32 v6, v12, v6 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v4, v12, v7 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; CHECK-NEXT: v_mul_hi_u32 v5, v9, v7 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v10 +; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_mul_hi_u32 v6, v12, v7 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CHECK-NEXT: v_mul_lo_u32 v5, v12, v7 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v10, v4 +; CHECK-NEXT: v_mul_hi_u32 v10, v3, v7 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v10 +; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v10 +; CHECK-NEXT: v_mul_hi_u32 v7, v12, v7 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v9, v3 -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v12, v4, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v12, v5, vcc ; CHECK-NEXT: v_mul_lo_u32 v5, v11, v3 ; CHECK-NEXT: v_mul_lo_u32 v6, v8, v4 ; CHECK-NEXT: v_mul_hi_u32 v7, v8, v3 ; CHECK-NEXT: v_mul_hi_u32 v3, v11, v3 -; CHECK-NEXT: v_mul_hi_u32 v9, v11, v4 +; CHECK-NEXT: v_mul_hi_u32 v10, v11, v4 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 @@ -121,38 +123,40 @@ define i64 @v_sdiv_i64(i64 %num, i64 %den) { ; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v7, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v9, v5 -; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, v[4:5] +; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v5 +; CHECK-NEXT: ; implicit-def: $vgpr5_vgpr6 ; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v8, v3 +; CHECK-NEXT: v_mov_b32_e32 v5, v4 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v10, v[5:6] ; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v7, v[4:5] ; CHECK-NEXT: v_subb_u32_e64 v5, s[4:5], v11, v4, vcc ; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v11, v4 ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v1 ; CHECK-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v2 ; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v3, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v1 ; CHECK-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v5, v8, v9, s[4:5] -; CHECK-NEXT: v_add_i32_e32 v8, vcc, 1, v7 -; CHECK-NEXT: v_addc_u32_e32 v9, vcc, 0, v6, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v5, v6, v8, s[4:5] +; CHECK-NEXT: v_add_i32_e32 v6, vcc, 1, v7 +; CHECK-NEXT: v_addc_u32_e32 v8, vcc, 0, v10, vcc ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; CHECK-NEXT: v_cndmask_b32_e32 v1, v11, v2, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v8 -; CHECK-NEXT: v_addc_u32_e32 v3, vcc, 0, v9, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v6 +; CHECK-NEXT: v_addc_u32_e32 v3, vcc, 0, v8, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v2, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v2, v9, v3, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v2, v8, v3, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v3, v10, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; CHECK-NEXT: v_xor_b32_e32 v3, v9, v0 +; CHECK-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc ; CHECK-NEXT: v_xor_b32_e32 v0, v1, v3 ; CHECK-NEXT: v_xor_b32_e32 v1, v2, v3 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 @@ -220,60 +224,64 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; CHECK-NEXT: v_trunc_f32_e32 v2, v1 ; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v0 -; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v2 -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v3, 0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v4, v[1:2] -; CHECK-NEXT: v_mul_hi_u32 v5, v3, v0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v3, v[1:2] -; CHECK-NEXT: v_mul_lo_u32 v2, v4, v0 -; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0 -; CHECK-NEXT: v_mul_lo_u32 v6, v3, v1 -; CHECK-NEXT: v_mul_lo_u32 v7, v4, v1 -; CHECK-NEXT: v_mul_hi_u32 v8, v3, v1 -; CHECK-NEXT: v_mul_hi_u32 v1, v4, v1 +; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v0 +; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v2 +; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v4, 0 +; CHECK-NEXT: v_mov_b32_e32 v2, v1 +; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v5, v[2:3] +; CHECK-NEXT: v_mul_hi_u32 v3, v4, v0 +; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v4, v[1:2] +; CHECK-NEXT: v_mul_lo_u32 v2, v5, v0 +; CHECK-NEXT: v_mul_hi_u32 v0, v5, v0 +; CHECK-NEXT: v_mul_lo_u32 v6, v4, v1 +; CHECK-NEXT: v_mul_lo_u32 v7, v5, v1 +; CHECK-NEXT: v_mul_hi_u32 v8, v4, v1 +; CHECK-NEXT: v_mul_hi_u32 v1, v5, v1 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v0 -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v3, 0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v4, v[1:2] -; CHECK-NEXT: v_mul_hi_u32 v6, v3, v0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v3, v[1:2] -; CHECK-NEXT: v_mul_lo_u32 v2, v4, v0 -; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0 -; CHECK-NEXT: v_mul_lo_u32 v5, v3, v1 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v0 +; CHECK-NEXT: v_addc_u32_e32 v5, vcc, v5, v1, vcc +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v4, 0 +; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3 +; CHECK-NEXT: v_mov_b32_e32 v2, v1 +; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v5, v[2:3] +; CHECK-NEXT: v_mul_hi_u32 v6, v4, v0 +; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v4, v[1:2] +; CHECK-NEXT: v_mul_lo_u32 v2, v5, v0 +; CHECK-NEXT: v_mul_hi_u32 v0, v5, v0 +; CHECK-NEXT: v_mul_lo_u32 v3, v4, v1 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v6, v4, v1 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_mul_hi_u32 v5, v3, v1 +; CHECK-NEXT: v_mul_lo_u32 v6, v5, v1 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; CHECK-NEXT: v_mul_hi_u32 v3, v4, v1 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v6, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_mul_hi_u32 v1, v4, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; CHECK-NEXT: v_mul_hi_u32 v1, v5, v1 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc ; CHECK-NEXT: v_mul_lo_u32 v2, s13, v0 ; CHECK-NEXT: v_mul_lo_u32 v3, s12, v1 ; CHECK-NEXT: v_mul_hi_u32 v4, s12, v0 @@ -295,12 +303,14 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v4, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s10, v2, v[1:2] -; CHECK-NEXT: v_mov_b32_e32 v5, s13 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v2 +; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, s12, v0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s11, v4, v[1:2] +; CHECK-NEXT: v_mov_b32_e32 v2, v1 +; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s10, v5, v[2:3] +; CHECK-NEXT: v_mov_b32_e32 v5, s13 ; CHECK-NEXT: v_mov_b32_e32 v3, s11 +; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s11, v4, v[1:2] ; CHECK-NEXT: v_subb_u32_e64 v2, s[0:1], v5, v1, vcc ; CHECK-NEXT: v_sub_i32_e64 v1, s[0:1], s13, v1 ; CHECK-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v2 @@ -383,6 +393,7 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v10 ; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v4 ; GISEL-NEXT: v_sub_i32_e32 v15, vcc, 0, v10 +; GISEL-NEXT: ; implicit-def: $vgpr13_vgpr14 ; GISEL-NEXT: v_subb_u32_e32 v16, vcc, 0, v4, vcc ; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v9 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5 @@ -390,184 +401,188 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v5 ; GISEL-NEXT: v_trunc_f32_e32 v9, v9 ; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v9 -; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v5 +; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5 ; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v14, 0 -; GISEL-NEXT: v_mov_b32_e32 v5, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v9, v[5:6] -; GISEL-NEXT: v_mul_lo_u32 v5, v9, v11 -; GISEL-NEXT: v_mul_hi_u32 v17, v14, v11 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[12:13] +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v5, 0 +; GISEL-NEXT: v_mov_b32_e32 v13, v12 +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v9, v[13:14] +; GISEL-NEXT: v_mul_hi_u32 v17, v5, v11 +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v5, v[12:13] +; GISEL-NEXT: v_mul_lo_u32 v13, v9, v11 ; GISEL-NEXT: v_mul_hi_u32 v11, v9, v11 -; GISEL-NEXT: v_mul_lo_u32 v13, v14, v12 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13 +; GISEL-NEXT: v_mul_lo_u32 v14, v5, v12 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_mul_lo_u32 v17, v9, v12 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 -; GISEL-NEXT: v_mul_hi_u32 v13, v14, v12 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; GISEL-NEXT: v_mul_hi_u32 v14, v5, v12 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v17, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v17, v14 +; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v17, v13 -; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v5 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v9, v11, vcc -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v14, 0 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v11 +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v9, v12, vcc +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v5, 0 +; GISEL-NEXT: ; implicit-def: $vgpr13_vgpr14 ; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 -; GISEL-NEXT: v_mov_b32_e32 v5, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[5:6] -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[12:13] +; GISEL-NEXT: v_mov_b32_e32 v13, v12 +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[13:14] ; GISEL-NEXT: v_xor_b32_e32 v15, v0, v9 ; GISEL-NEXT: v_mul_lo_u32 v0, v17, v11 -; GISEL-NEXT: v_mul_lo_u32 v5, v14, v12 -; GISEL-NEXT: v_xor_b32_e32 v16, v1, v9 -; GISEL-NEXT: v_mul_hi_u32 v1, v14, v11 +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v5, v[12:13] +; GISEL-NEXT: v_mul_hi_u32 v14, v5, v11 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc +; GISEL-NEXT: v_mul_lo_u32 v13, v5, v12 ; GISEL-NEXT: v_mul_hi_u32 v11, v17, v11 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GISEL-NEXT: v_xor_b32_e32 v1, v1, v9 +; GISEL-NEXT: v_xor_b32_e32 v8, v9, v8 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v17, v12 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_mul_hi_u32 v5, v14, v12 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 +; GISEL-NEXT: v_mul_lo_u32 v14, v17, v12 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 +; GISEL-NEXT: v_mul_hi_u32 v13, v5, v12 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v14, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; GISEL-NEXT: v_mul_hi_u32 v12, v17, v12 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5 -; GISEL-NEXT: v_mul_hi_u32 v11, v17, v12 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, v16, v0 -; GISEL-NEXT: v_mul_lo_u32 v11, v15, v1 -; GISEL-NEXT: v_mul_hi_u32 v12, v15, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0 -; GISEL-NEXT: v_xor_b32_e32 v8, v9, v8 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v11 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 +; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v17, v11, vcc +; GISEL-NEXT: v_mul_lo_u32 v11, v1, v0 +; GISEL-NEXT: v_mul_lo_u32 v12, v15, v5 +; GISEL-NEXT: v_mul_hi_u32 v13, v15, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v1, v0 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v12, v16, v1 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5 -; GISEL-NEXT: v_mul_hi_u32 v11, v15, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 +; GISEL-NEXT: v_mul_lo_u32 v13, v1, v5 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; GISEL-NEXT: v_mul_hi_u32 v12, v15, v5 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v12 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v12, v11 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; GISEL-NEXT: v_mul_hi_u32 v1, v16, v1 +; GISEL-NEXT: v_mul_hi_u32 v5, v1, v5 ; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v0, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v1, v5 -; GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v10, v14, v[1:2] +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v5, v13 +; GISEL-NEXT: ; implicit-def: $vgpr13_vgpr14 ; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v7 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v6, v5 -; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v7, v5, vcc +; GISEL-NEXT: v_mov_b32_e32 v13, v12 +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v10, v16, v[13:14] +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v5 ; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v0, v[12:13] -; GISEL-NEXT: v_xor_b32_e32 v7, v1, v5 -; GISEL-NEXT: v_xor_b32_e32 v6, v6, v5 -; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v7 -; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v6 +; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v7, v5, vcc +; GISEL-NEXT: v_xor_b32_e32 v7, v6, v5 +; GISEL-NEXT: v_xor_b32_e32 v6, v13, v5 +; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v7 +; GISEL-NEXT: v_cvt_f32_u32_e32 v14, v6 ; GISEL-NEXT: v_sub_i32_e32 v15, vcc, v15, v11 -; GISEL-NEXT: v_sub_i32_e64 v11, s[4:5], v16, v12 -; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v13 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v16, v12, vcc -; GISEL-NEXT: v_subb_u32_e32 v13, vcc, v11, v4, vcc -; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 -; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v1 -; GISEL-NEXT: v_trunc_f32_e32 v16, v11 -; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v16 -; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v1 +; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v1, v12, vcc +; GISEL-NEXT: v_mac_f32_e32 v13, 0x4f800000, v14 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v11, v13 +; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v12 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc +; GISEL-NEXT: v_mul_f32_e32 v11, 0x5f7ffffc, v11 +; GISEL-NEXT: v_mul_f32_e32 v12, 0x2f800000, v11 +; GISEL-NEXT: v_trunc_f32_e32 v13, v12 +; GISEL-NEXT: v_mac_f32_e32 v11, 0xcf800000, v13 +; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v11 ; GISEL-NEXT: v_sub_i32_e32 v19, vcc, 0, v7 ; GISEL-NEXT: v_subb_u32_e32 v20, vcc, 0, v6, vcc ; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[6:7], v19, v18, 0 -; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v16 -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v15, v10 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v10 -; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v13, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v10 -; GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v19, v16, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v1, v16, v11 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v15, v4 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v20, v18, v[12:13] +; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v15, v10 +; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v13 +; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v14, v10 +; GISEL-NEXT: ; implicit-def: $vgpr13_vgpr14 +; GISEL-NEXT: v_mul_lo_u32 v10, v15, v11 +; GISEL-NEXT: v_mov_b32_e32 v13, v12 +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v19, v15, v[13:14] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v1, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v20, v18, v[12:13] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v17, v4 -; GISEL-NEXT: v_mul_lo_u32 v10, v18, v12 -; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v18, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[6:7] -; GISEL-NEXT: v_mul_hi_u32 v11, v16, v11 -; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[8:9] -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v15, v4 +; GISEL-NEXT: v_mul_lo_u32 v13, v18, v12 +; GISEL-NEXT: v_add_i32_e64 v10, s[6:7], v10, v13 +; GISEL-NEXT: v_mul_hi_u32 v13, v18, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[6:7] +; GISEL-NEXT: v_mul_hi_u32 v11, v15, v11 +; GISEL-NEXT: v_add_i32_e64 v10, s[6:7], v10, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[8:9] +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v4 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v17, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[8:9] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v4, v10, v13, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v4, v15, v21, s[4:5] -; GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v14, vcc +; GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v16, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[8:9] ; GISEL-NEXT: v_add_i32_e32 v17, vcc, 1, v10 -; GISEL-NEXT: v_addc_u32_e32 v21, vcc, 0, v15, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v17, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v21, s[4:5] +; GISEL-NEXT: v_addc_u32_e32 v21, vcc, 0, v13, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GISEL-NEXT: v_cndmask_b32_e32 v4, v10, v17, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e32 v15, v15, v21, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v10 -; GISEL-NEXT: v_mul_lo_u32 v13, v16, v12 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; GISEL-NEXT: v_mul_hi_u32 v13, v18, v12 +; GISEL-NEXT: v_cndmask_b32_e32 v13, v13, v21, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10 +; GISEL-NEXT: v_mul_lo_u32 v14, v15, v12 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v14, v11 +; GISEL-NEXT: v_mul_hi_u32 v14, v18, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_mul_hi_u32 v12, v16, v12 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v17, v13 +; GISEL-NEXT: v_mul_hi_u32 v12, v15, v12 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v17, v14 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v14, v11 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v18, v10 -; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v16, v11, vcc +; GISEL-NEXT: v_addc_u32_e32 v14, vcc, v15, v11, vcc ; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v19, v12, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc +; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GISEL-NEXT: v_cndmask_b32_e32 v13, v16, v13, vcc ; GISEL-NEXT: v_mov_b32_e32 v0, v11 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v13, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v14, v[0:1] ; GISEL-NEXT: v_ashrrev_i32_e32 v11, 31, v3 -; GISEL-NEXT: v_cndmask_b32_e32 v14, v14, v15, vcc +; GISEL-NEXT: v_xor_b32_e32 v9, v4, v8 ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v12, v[0:1] ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v11 ; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v3, v11, vcc ; GISEL-NEXT: v_xor_b32_e32 v15, v1, v11 -; GISEL-NEXT: v_mul_lo_u32 v1, v13, v10 +; GISEL-NEXT: v_mul_lo_u32 v1, v14, v10 ; GISEL-NEXT: v_mul_lo_u32 v3, v12, v0 ; GISEL-NEXT: v_xor_b32_e32 v16, v2, v11 ; GISEL-NEXT: v_mul_hi_u32 v2, v12, v10 -; GISEL-NEXT: v_xor_b32_e32 v9, v4, v8 +; GISEL-NEXT: v_mul_hi_u32 v4, v14, v10 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v2, v13, v0 -; GISEL-NEXT: v_mul_hi_u32 v4, v13, v10 +; GISEL-NEXT: v_mul_lo_u32 v2, v14, v0 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v3, v1 ; GISEL-NEXT: v_mul_hi_u32 v3, v12, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 @@ -575,18 +590,18 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v12, v1 -; GISEL-NEXT: v_addc_u32_e32 v0, vcc, v13, v0, vcc +; GISEL-NEXT: v_addc_u32_e32 v0, vcc, v14, v0, vcc ; GISEL-NEXT: v_mul_lo_u32 v2, v16, v1 ; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0 ; GISEL-NEXT: v_mul_hi_u32 v4, v15, v1 ; GISEL-NEXT: v_mul_hi_u32 v1, v16, v1 -; GISEL-NEXT: v_xor_b32_e32 v10, v14, v8 +; GISEL-NEXT: v_xor_b32_e32 v10, v13, v8 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 @@ -605,6 +620,7 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v0, v1 +; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 ; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v13, v[0:1] ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v9, v8 @@ -667,100 +683,106 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_xor_b32_e32 v1, v3, v0 ; CGP-NEXT: v_cvt_f32_u32_e32 v3, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v1 -; CGP-NEXT: v_sub_i32_e32 v13, vcc, 0, v2 -; CGP-NEXT: v_subb_u32_e32 v14, vcc, 0, v1, vcc +; CGP-NEXT: v_sub_i32_e32 v15, vcc, 0, v2 +; CGP-NEXT: ; implicit-def: $vgpr12_vgpr13 +; CGP-NEXT: v_subb_u32_e32 v16, vcc, 0, v1, vcc ; CGP-NEXT: v_mac_f32_e32 v3, 0x4f800000, v4 ; CGP-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; CGP-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 ; CGP-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 ; CGP-NEXT: v_trunc_f32_e32 v5, v4 ; CGP-NEXT: v_mac_f32_e32 v3, 0xcf800000, v5 -; CGP-NEXT: v_cvt_u32_f32_e32 v12, v3 -; CGP-NEXT: v_cvt_u32_f32_e32 v15, v5 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v12, 0 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v15, v[4:5] -; CGP-NEXT: v_mul_hi_u32 v16, v12, v3 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v12, v[4:5] -; CGP-NEXT: v_mul_lo_u32 v5, v15, v3 -; CGP-NEXT: v_mul_hi_u32 v3, v15, v3 -; CGP-NEXT: v_mul_lo_u32 v17, v12, v4 -; CGP-NEXT: v_mul_lo_u32 v18, v15, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v17 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v16 -; CGP-NEXT: v_mul_hi_u32 v16, v12, v4 +; CGP-NEXT: v_cvt_u32_f32_e32 v14, v3 +; CGP-NEXT: v_cvt_u32_f32_e32 v17, v5 +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v15, v14, 0 +; CGP-NEXT: v_mov_b32_e32 v12, v4 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v15, v17, v[12:13] +; CGP-NEXT: v_mul_hi_u32 v12, v14, v3 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v16, v14, v[4:5] +; CGP-NEXT: v_mul_lo_u32 v5, v17, v3 +; CGP-NEXT: v_mul_hi_u32 v3, v17, v3 +; CGP-NEXT: v_mul_lo_u32 v13, v14, v4 +; CGP-NEXT: v_mul_lo_u32 v18, v17, v4 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v13 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12 +; CGP-NEXT: v_mul_hi_u32 v12, v14, v4 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v17, v5 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v13, v5 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v18, v3 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v16 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v16, vcc, v17, v16 -; CGP-NEXT: v_mul_hi_u32 v4, v15, v4 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v12 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; CGP-NEXT: v_mul_hi_u32 v4, v17, v4 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v16, v5 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v3 -; CGP-NEXT: v_addc_u32_e32 v15, vcc, v15, v4, vcc -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v12, 0 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v15, v[4:5] -; CGP-NEXT: v_ashrrev_i32_e32 v13, 31, v11 -; CGP-NEXT: v_mul_hi_u32 v16, v12, v3 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v12, v[4:5] -; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v13 -; CGP-NEXT: v_addc_u32_e32 v10, vcc, v11, v13, vcc -; CGP-NEXT: v_xor_b32_e32 v11, v5, v13 -; CGP-NEXT: v_mul_lo_u32 v5, v15, v3 -; CGP-NEXT: v_mul_lo_u32 v14, v12, v4 -; CGP-NEXT: v_mul_hi_u32 v3, v15, v3 -; CGP-NEXT: v_xor_b32_e32 v10, v10, v13 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v16 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v3 +; CGP-NEXT: v_addc_u32_e32 v17, vcc, v17, v4, vcc +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v15, v14, 0 +; CGP-NEXT: ; implicit-def: $vgpr12_vgpr13 +; CGP-NEXT: v_mov_b32_e32 v12, v4 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v15, v17, v[12:13] +; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v11 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v16, v14, v[4:5] +; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v12 +; CGP-NEXT: v_addc_u32_e32 v10, vcc, v11, v12, vcc +; CGP-NEXT: v_xor_b32_e32 v13, v5, v12 +; CGP-NEXT: v_mul_lo_u32 v5, v17, v3 +; CGP-NEXT: v_mul_lo_u32 v11, v14, v4 +; CGP-NEXT: v_xor_b32_e32 v15, v10, v12 +; CGP-NEXT: v_mul_hi_u32 v10, v14, v3 +; CGP-NEXT: v_mul_hi_u32 v3, v17, v3 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v16, v15, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v14, v5 -; CGP-NEXT: v_mul_hi_u32 v14, v12, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v16, v3 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14 -; CGP-NEXT: v_mul_hi_u32 v4, v15, v4 +; CGP-NEXT: v_mul_lo_u32 v10, v17, v4 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v11, v5 +; CGP-NEXT: v_mul_hi_u32 v11, v14, v4 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v3 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; CGP-NEXT: v_mul_hi_u32 v4, v17, v4 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v14, v5 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v12, v3 -; CGP-NEXT: v_addc_u32_e32 v4, vcc, v15, v4, vcc -; CGP-NEXT: v_mul_lo_u32 v5, v10, v3 -; CGP-NEXT: v_mul_lo_u32 v12, v11, v4 -; CGP-NEXT: v_mul_hi_u32 v14, v11, v3 -; CGP-NEXT: v_mul_hi_u32 v3, v10, v3 -; CGP-NEXT: v_mul_hi_u32 v15, v10, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v14, v10, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5 -; CGP-NEXT: v_mul_hi_u32 v12, v11, v4 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v14, v3 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v12 +; CGP-NEXT: v_addc_u32_e32 v4, vcc, v17, v4, vcc +; CGP-NEXT: v_mul_lo_u32 v5, v15, v3 +; CGP-NEXT: v_mul_lo_u32 v10, v13, v4 +; CGP-NEXT: v_mul_hi_u32 v11, v13, v3 +; CGP-NEXT: v_mul_hi_u32 v3, v15, v3 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v11 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v11, v15, v4 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; CGP-NEXT: v_mul_hi_u32 v10, v13, v4 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v11, v3 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v10 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; CGP-NEXT: v_add_i32_e32 v14, vcc, v3, v5 +; CGP-NEXT: v_mul_hi_u32 v11, v15, v4 ; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v14, 0 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v15, v5 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v12, v[4:5] -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v11, v3 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; CGP-NEXT: v_add_i32_e32 v16, vcc, v11, v5 +; CGP-NEXT: ; implicit-def: $vgpr10_vgpr11 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v13, v3 +; CGP-NEXT: v_mov_b32_e32 v10, v4 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v16, v[10:11] ; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v14, v[4:5] -; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v10, v4, vcc -; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v10, v4 +; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v15, v4, vcc +; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v15, v4 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v1 ; CGP-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] @@ -771,13 +793,13 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc ; CGP-NEXT: v_cndmask_b32_e64 v5, v10, v11, s[4:5] ; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v14 -; CGP-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc +; CGP-NEXT: v_addc_u32_e32 v11, vcc, 0, v16, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v4, v1 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, -1, vcc +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; CGP-NEXT: v_cndmask_b32_e32 v1, v15, v2, vcc +; CGP-NEXT: v_cndmask_b32_e32 v1, v13, v2, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v10 ; CGP-NEXT: v_addc_u32_e32 v3, vcc, 0, v11, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 @@ -785,8 +807,8 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CGP-NEXT: v_cndmask_b32_e32 v1, v14, v1, vcc -; CGP-NEXT: v_xor_b32_e32 v3, v13, v0 -; CGP-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc +; CGP-NEXT: v_xor_b32_e32 v3, v12, v0 +; CGP-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc ; CGP-NEXT: v_xor_b32_e32 v0, v1, v3 ; CGP-NEXT: v_xor_b32_e32 v1, v2, v3 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 @@ -840,100 +862,106 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_xor_b32_e32 v3, v5, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v5, v4 ; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3 -; CGP-NEXT: v_sub_i32_e32 v11, vcc, 0, v4 -; CGP-NEXT: v_subb_u32_e32 v12, vcc, 0, v3, vcc +; CGP-NEXT: v_sub_i32_e32 v13, vcc, 0, v4 +; CGP-NEXT: ; implicit-def: $vgpr10_vgpr11 +; CGP-NEXT: v_subb_u32_e32 v14, vcc, 0, v3, vcc ; CGP-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 ; CGP-NEXT: v_rcp_iflag_f32_e32 v5, v5 ; CGP-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 ; CGP-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 ; CGP-NEXT: v_trunc_f32_e32 v7, v6 ; CGP-NEXT: v_mac_f32_e32 v5, 0xcf800000, v7 -; CGP-NEXT: v_cvt_u32_f32_e32 v10, v5 -; CGP-NEXT: v_cvt_u32_f32_e32 v13, v7 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v10, 0 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[6:7] -; CGP-NEXT: v_mul_hi_u32 v14, v10, v5 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v10, v[6:7] -; CGP-NEXT: v_mul_lo_u32 v7, v13, v5 -; CGP-NEXT: v_mul_hi_u32 v5, v13, v5 -; CGP-NEXT: v_mul_lo_u32 v15, v10, v6 -; CGP-NEXT: v_mul_lo_u32 v16, v13, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v14 -; CGP-NEXT: v_mul_hi_u32 v14, v10, v6 +; CGP-NEXT: v_cvt_u32_f32_e32 v12, v5 +; CGP-NEXT: v_cvt_u32_f32_e32 v15, v7 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0 +; CGP-NEXT: v_mov_b32_e32 v10, v6 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v15, v[10:11] +; CGP-NEXT: v_mul_hi_u32 v10, v12, v5 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v14, v12, v[6:7] +; CGP-NEXT: v_mul_lo_u32 v7, v15, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v15, v5 +; CGP-NEXT: v_mul_lo_u32 v11, v12, v6 +; CGP-NEXT: v_mul_lo_u32 v16, v15, v6 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v12, v6 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v15, v7 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v11, v7 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v16, v5 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_mul_hi_u32 v6, v13, v6 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_mul_hi_u32 v6, v15, v6 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v14, v7 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v5 -; CGP-NEXT: v_addc_u32_e32 v13, vcc, v13, v6, vcc -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v10, 0 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[6:7] -; CGP-NEXT: v_ashrrev_i32_e32 v11, 31, v9 -; CGP-NEXT: v_mul_hi_u32 v14, v10, v5 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v10, v[6:7] -; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v11 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v11, vcc -; CGP-NEXT: v_xor_b32_e32 v9, v7, v11 -; CGP-NEXT: v_mul_lo_u32 v7, v13, v5 -; CGP-NEXT: v_mul_lo_u32 v12, v10, v6 -; CGP-NEXT: v_mul_hi_u32 v5, v13, v5 -; CGP-NEXT: v_xor_b32_e32 v8, v8, v11 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v14 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v5 +; CGP-NEXT: v_addc_u32_e32 v15, vcc, v15, v6, vcc +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0 +; CGP-NEXT: ; implicit-def: $vgpr10_vgpr11 +; CGP-NEXT: v_mov_b32_e32 v10, v6 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v15, v[10:11] +; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v9 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v14, v12, v[6:7] +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v10 +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v10, vcc +; CGP-NEXT: v_xor_b32_e32 v9, v7, v10 +; CGP-NEXT: v_mul_lo_u32 v7, v15, v5 +; CGP-NEXT: v_mul_lo_u32 v11, v12, v6 +; CGP-NEXT: v_xor_b32_e32 v13, v8, v10 +; CGP-NEXT: v_mul_hi_u32 v8, v12, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v15, v5 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v14, v13, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v12, v7 -; CGP-NEXT: v_mul_hi_u32 v12, v10, v6 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v14, v5 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v12 -; CGP-NEXT: v_mul_hi_u32 v6, v13, v6 +; CGP-NEXT: v_mul_lo_u32 v8, v15, v6 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v11, v7 +; CGP-NEXT: v_mul_hi_u32 v11, v12, v6 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 +; CGP-NEXT: v_mul_hi_u32 v6, v15, v6 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v12, v7 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; CGP-NEXT: v_addc_u32_e32 v6, vcc, v13, v6, vcc -; CGP-NEXT: v_mul_lo_u32 v7, v8, v5 -; CGP-NEXT: v_mul_lo_u32 v10, v9, v6 -; CGP-NEXT: v_mul_hi_u32 v12, v9, v5 -; CGP-NEXT: v_mul_hi_u32 v5, v8, v5 -; CGP-NEXT: v_mul_hi_u32 v13, v8, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v12 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v12, v8, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7 -; CGP-NEXT: v_mul_hi_u32 v10, v9, v6 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v5, v7 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v12, 0 +; CGP-NEXT: v_addc_u32_e32 v6, vcc, v15, v6, vcc +; CGP-NEXT: v_mul_lo_u32 v7, v13, v5 +; CGP-NEXT: v_mul_lo_u32 v8, v9, v6 +; CGP-NEXT: v_mul_hi_u32 v11, v9, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v13, v5 +; CGP-NEXT: v_mul_hi_u32 v12, v13, v6 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v11 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v13, v7 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v10, v[6:7] +; CGP-NEXT: v_mul_lo_u32 v11, v13, v6 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CGP-NEXT: v_mul_hi_u32 v8, v9, v6 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v11, v5 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v11, v8 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v5, v7 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v11, 0 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v7 +; CGP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; CGP-NEXT: v_sub_i32_e32 v5, vcc, v9, v5 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v12, v[6:7] -; CGP-NEXT: v_subb_u32_e64 v7, s[4:5], v8, v6, vcc -; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v8, v6 +; CGP-NEXT: v_mov_b32_e32 v7, v6 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v12, v[7:8] +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v11, v[6:7] +; CGP-NEXT: v_subb_u32_e64 v7, s[4:5], v13, v6, vcc +; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v13, v6 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v3 ; CGP-NEXT: v_subb_u32_e32 v6, vcc, v6, v3, vcc ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] @@ -943,8 +971,8 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v3 ; CGP-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v6, vcc ; CGP-NEXT: v_cndmask_b32_e64 v7, v8, v9, s[4:5] -; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v12 -; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v11 +; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v12, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v6, v3 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v4 @@ -957,9 +985,9 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc ; CGP-NEXT: v_cndmask_b32_e32 v4, v9, v5, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; CGP-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc -; CGP-NEXT: v_xor_b32_e32 v5, v11, v2 -; CGP-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc +; CGP-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc +; CGP-NEXT: v_xor_b32_e32 v5, v10, v2 +; CGP-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc ; CGP-NEXT: v_xor_b32_e32 v2, v3, v5 ; CGP-NEXT: v_xor_b32_e32 v3, v4, v5 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 @@ -1049,77 +1077,81 @@ define i64 @v_sdiv_i64_oddk_denom(i64 %num) { ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x12d8fb ; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v3, 0 -; CHECK-NEXT: v_mov_b32_e32 v6, 0xffed2705 +; CHECK-NEXT: v_mov_b32_e32 v7, 0xffed2705 ; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; CHECK-NEXT: v_trunc_f32_e32 v4, v3 ; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 -; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v4 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4] -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v2 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4] -; CHECK-NEXT: v_mul_lo_u32 v4, v7, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2 -; CHECK-NEXT: v_mul_lo_u32 v9, v5, v3 -; CHECK-NEXT: v_mul_lo_u32 v10, v7, v3 -; CHECK-NEXT: v_mul_hi_u32 v11, v5, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3 +; CHECK-NEXT: v_cvt_u32_f32_e32 v6, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v4 +; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v6, 0 +; CHECK-NEXT: v_mov_b32_e32 v4, v3 +; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v8, v[4:5] +; CHECK-NEXT: v_mul_hi_u32 v5, v6, v2 +; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v6, v[3:4] +; CHECK-NEXT: v_mul_lo_u32 v4, v8, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v8, v2 +; CHECK-NEXT: v_mul_lo_u32 v9, v6, v3 +; CHECK-NEXT: v_mul_lo_u32 v10, v8, v3 +; CHECK-NEXT: v_mul_hi_u32 v11, v6, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v8, v3 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v9 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v9, v4 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v11 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v10, v8 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v10, v5 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v2 -; CHECK-NEXT: v_addc_u32_e32 v7, vcc, v7, v3, vcc -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4] -; CHECK-NEXT: v_ashrrev_i32_e32 v6, 31, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4] -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc -; CHECK-NEXT: v_xor_b32_e32 v4, v0, v6 -; CHECK-NEXT: v_mul_lo_u32 v0, v7, v2 -; CHECK-NEXT: v_mul_lo_u32 v8, v5, v3 -; CHECK-NEXT: v_xor_b32_e32 v9, v1, v6 -; CHECK-NEXT: v_mul_hi_u32 v1, v5, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v2 +; CHECK-NEXT: v_addc_u32_e32 v8, vcc, v8, v3, vcc +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v6, 0 +; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 +; CHECK-NEXT: v_mov_b32_e32 v4, v3 +; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v8, v[4:5] +; CHECK-NEXT: v_ashrrev_i32_e32 v5, 31, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v5 +; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v6, v[3:4] +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v5, vcc +; CHECK-NEXT: v_xor_b32_e32 v4, v0, v5 +; CHECK-NEXT: v_mul_lo_u32 v0, v8, v2 +; CHECK-NEXT: v_mul_lo_u32 v7, v6, v3 +; CHECK-NEXT: v_xor_b32_e32 v9, v1, v5 +; CHECK-NEXT: v_mul_hi_u32 v1, v6, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v8, v2 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v1, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v3 +; CHECK-NEXT: v_mul_lo_u32 v1, v8, v3 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0 +; CHECK-NEXT: v_mul_hi_u32 v7, v6, v3 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8 -; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 +; CHECK-NEXT: v_mul_hi_u32 v3, v8, v3 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v6, v0 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v8, v1, vcc ; CHECK-NEXT: v_mul_lo_u32 v2, v9, v0 ; CHECK-NEXT: v_mul_lo_u32 v3, v4, v1 ; CHECK-NEXT: v_mul_hi_u32 v7, v4, v0 ; CHECK-NEXT: v_mul_hi_u32 v0, v9, v0 -; CHECK-NEXT: v_mov_b32_e32 v5, 0x12d8fb +; CHECK-NEXT: v_mov_b32_e32 v6, 0x12d8fb ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 @@ -1134,39 +1166,41 @@ define i64 @v_sdiv_i64_oddk_denom(i64 %num) { ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v0, v2 ; CHECK-NEXT: v_mul_hi_u32 v8, v9, v1 -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v7, 0 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v8, v2 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v3, v[1:2] +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2 +; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v4, v0 +; CHECK-NEXT: v_mov_b32_e32 v2, v1 +; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v8, v[2:3] ; CHECK-NEXT: v_subb_u32_e64 v2, s[4:5], v9, v1, vcc ; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v9, v1 ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v5 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v6 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2 ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v2, -1, v4, s[4:5] -; CHECK-NEXT: v_add_i32_e32 v4, vcc, 1, v7 -; CHECK-NEXT: v_addc_u32_e32 v8, vcc, 0, v3, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v2, -1, v3, s[4:5] +; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v7 +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; CHECK-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; CHECK-NEXT: v_add_i32_e32 v1, vcc, 1, v4 -; CHECK-NEXT: v_addc_u32_e32 v5, vcc, 0, v8, vcc +; CHECK-NEXT: v_add_i32_e32 v1, vcc, 1, v3 +; CHECK-NEXT: v_addc_u32_e32 v6, vcc, 0, v4, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v5, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v6 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v6 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v5 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v5 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = sdiv i64 %num, 1235195 ret i64 %result @@ -1179,6 +1213,7 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cvt_f32_u32_e32 v4, 0x12d8fb ; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v5, 0 ; GISEL-NEXT: v_mov_b32_e32 v6, 0xffed2705 +; GISEL-NEXT: ; implicit-def: $vgpr9_vgpr10 ; GISEL-NEXT: s_mov_b32 s6, 1 ; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 @@ -1215,42 +1250,43 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v7, v4 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v4, 0 +; GISEL-NEXT: ; implicit-def: $vgpr15_vgpr16 ; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc -; GISEL-NEXT: v_mov_b32_e32 v4, v14 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v4, v17, v13 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], s6, v16, v[14:15] +; GISEL-NEXT: v_mov_b32_e32 v15, v14 +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[15:16] +; GISEL-NEXT: v_mul_lo_u32 v9, v17, v13 +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], s6, v4, v[14:15] ; GISEL-NEXT: s_mov_b32 s6, 1 ; GISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GISEL-NEXT: v_mul_lo_u32 v9, v16, v14 +; GISEL-NEXT: v_mul_lo_u32 v15, v4, v14 ; GISEL-NEXT: s_subb_u32 s6, 0, 0 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v16, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_mul_hi_u32 v9, v17, v13 -; GISEL-NEXT: v_mul_lo_u32 v13, v17, v14 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; GISEL-NEXT: v_mul_hi_u32 v15, v16, v14 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v9, v15 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v15 +; GISEL-NEXT: v_mul_hi_u32 v15, v4, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v13, v17, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v15, v17, v14 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v4, v14 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v9 ; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc ; GISEL-NEXT: v_xor_b32_e32 v18, v0, v9 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v17, v14 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v16 +; GISEL-NEXT: v_mul_hi_u32 v13, v17, v14 ; GISEL-NEXT: v_xor_b32_e32 v19, v1, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v15, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc ; GISEL-NEXT: v_mul_lo_u32 v13, v19, v0 ; GISEL-NEXT: v_mul_lo_u32 v14, v18, v1 @@ -1275,8 +1311,10 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v13 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v16, v[1:2] +; GISEL-NEXT: ; implicit-def: $vgpr13_vgpr14 ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v18, v0 +; GISEL-NEXT: v_mov_b32_e32 v13, v1 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v16, v[13:14] ; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v19, v13, vcc ; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], v19, v13 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4 @@ -1291,8 +1329,9 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GISEL-NEXT: v_cndmask_b32_e32 v18, -1, v0, vcc -; GISEL-NEXT: v_mov_b32_e32 v0, v5 +; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GISEL-NEXT: v_cndmask_b32_e64 v14, -1, v14, s[4:5] +; GISEL-NEXT: v_mov_b32_e32 v0, v5 ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[0:1] ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v7, v[0:1] ; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v13 @@ -1318,72 +1357,76 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v1 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v7, v1 +; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v8, v0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GISEL-NEXT: v_cndmask_b32_e32 v11, v16, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v8, v[1:2] -; GISEL-NEXT: v_xor_b32_e32 v1, v11, v9 -; GISEL-NEXT: v_ashrrev_i32_e32 v11, 31, v3 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v7, v[5:6] -; GISEL-NEXT: v_cndmask_b32_e32 v10, v15, v13, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v11 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v11, vcc -; GISEL-NEXT: v_xor_b32_e32 v12, v2, v11 -; GISEL-NEXT: v_mul_lo_u32 v2, v8, v0 -; GISEL-NEXT: v_mul_lo_u32 v6, v7, v5 -; GISEL-NEXT: v_xor_b32_e32 v13, v3, v11 -; GISEL-NEXT: v_mul_hi_u32 v3, v7, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v8, v0 +; GISEL-NEXT: v_cndmask_b32_e32 v7, v15, v13, vcc +; GISEL-NEXT: v_xor_b32_e32 v13, v7, v9 +; GISEL-NEXT: ; implicit-def: $vgpr7_vgpr8 +; GISEL-NEXT: v_cndmask_b32_e32 v12, v16, v5, vcc +; GISEL-NEXT: v_mov_b32_e32 v7, v1 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v11, v[7:8] +; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v3 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v7 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v10, v[5:6] +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v7, vcc +; GISEL-NEXT: v_xor_b32_e32 v8, v2, v7 +; GISEL-NEXT: v_mul_lo_u32 v2, v11, v0 +; GISEL-NEXT: v_mul_lo_u32 v6, v10, v5 +; GISEL-NEXT: v_xor_b32_e32 v1, v12, v9 +; GISEL-NEXT: v_xor_b32_e32 v12, v3, v7 +; GISEL-NEXT: v_mul_hi_u32 v3, v10, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v8, v5 +; GISEL-NEXT: v_mul_lo_u32 v3, v11, v5 +; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; GISEL-NEXT: v_mul_hi_u32 v6, v7, v5 +; GISEL-NEXT: v_mul_hi_u32 v6, v10, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GISEL-NEXT: v_mul_hi_u32 v5, v8, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v11, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, v13, v3 -; GISEL-NEXT: v_mul_lo_u32 v6, v12, v2 -; GISEL-NEXT: v_xor_b32_e32 v10, v10, v9 -; GISEL-NEXT: v_mul_hi_u32 v7, v12, v3 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v10, v9 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v10, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v5, v12, v3 +; GISEL-NEXT: v_mul_lo_u32 v6, v8, v2 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v13, v9 ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc +; GISEL-NEXT: v_mul_hi_u32 v9, v8, v3 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v7, v13, v2 -; GISEL-NEXT: v_mul_hi_u32 v3, v13, v3 +; GISEL-NEXT: v_mul_lo_u32 v9, v12, v2 +; GISEL-NEXT: v_mul_hi_u32 v3, v12, v3 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GISEL-NEXT: v_mul_hi_u32 v6, v12, v2 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v6, v8, v2 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v9, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v3, v5 -; GISEL-NEXT: v_mul_hi_u32 v8, v13, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v3, v5 +; GISEL-NEXT: v_mul_hi_u32 v10, v12, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v9, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v5 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v8, v[3:4] -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 -; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v5, vcc -; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v5 +; GISEL-NEXT: ; implicit-def: $vgpr5_vgpr6 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v8, v2 +; GISEL-NEXT: v_mov_b32_e32 v5, v3 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v10, v[5:6] +; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v12, v5, vcc +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v12, v5 ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v4 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 @@ -1391,24 +1434,24 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v3 ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v3, -1, v6, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v7 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v8, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v9 +; GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v10, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 ; GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, 1, v6 -; GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v9, vcc +; GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v8, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v6, v4, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v4, v9, v5, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v4, v8, v5, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v11 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v11 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v11 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v11, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v4, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v7 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v7 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v7 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_sdiv_v2i64_oddk_denom: @@ -1417,6 +1460,7 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_cvt_f32_u32_e32 v4, 0x12d8fb ; CGP-NEXT: v_cvt_f32_ubyte0_e32 v5, 0 ; CGP-NEXT: v_mov_b32_e32 v6, 0xffed2705 +; CGP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 @@ -1450,39 +1494,40 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v13 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v4, 0 +; CGP-NEXT: ; implicit-def: $vgpr15_vgpr16 ; CGP-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc -; CGP-NEXT: v_mov_b32_e32 v4, v14 -; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5] -; CGP-NEXT: v_mul_lo_u32 v4, v17, v13 -; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], -1, v16, v[14:15] -; CGP-NEXT: v_mul_lo_u32 v9, v16, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v16, v13 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_hi_u32 v9, v17, v13 -; CGP-NEXT: v_mul_lo_u32 v13, v17, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_mul_hi_u32 v15, v16, v14 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v15, vcc, v9, v15 +; CGP-NEXT: v_mov_b32_e32 v15, v14 +; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[15:16] +; CGP-NEXT: v_mul_lo_u32 v9, v17, v13 +; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], -1, v4, v[14:15] +; CGP-NEXT: v_mul_lo_u32 v15, v4, v14 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v15 +; CGP-NEXT: v_mul_hi_u32 v15, v4, v13 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v13, v17, v13 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v15 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v15, v17, v14 +; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v9 +; CGP-NEXT: v_mul_hi_u32 v9, v4, v14 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v9 ; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v1 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v9 ; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc ; CGP-NEXT: v_xor_b32_e32 v18, v0, v9 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v17, v14 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v13, v16 +; CGP-NEXT: v_mul_hi_u32 v13, v17, v14 ; CGP-NEXT: v_xor_b32_e32 v19, v1, v9 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v1, vcc, v15, v1 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v0 ; CGP-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc ; CGP-NEXT: v_mul_lo_u32 v13, v19, v0 ; CGP-NEXT: v_mul_lo_u32 v14, v18, v1 @@ -1507,115 +1552,122 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 ; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v13 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v16, v[1:2] +; CGP-NEXT: ; implicit-def: $vgpr13_vgpr14 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v18, v0 +; CGP-NEXT: v_mov_b32_e32 v13, v1 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v16, v[13:14] ; CGP-NEXT: v_subb_u32_e64 v1, s[4:5], v19, v13, vcc ; CGP-NEXT: v_sub_i32_e64 v13, s[4:5], v19, v13 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 ; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v13, vcc -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 -; CGP-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v1, vcc -; CGP-NEXT: v_add_i32_e32 v17, vcc, 1, v15 -; CGP-NEXT: v_addc_u32_e32 v18, vcc, 0, v16, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 -; CGP-NEXT: v_mov_b32_e32 v0, v5 +; CGP-NEXT: v_sub_i32_e32 v13, vcc, v0, v4 +; CGP-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v1, vcc +; CGP-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CGP-NEXT: v_cndmask_b32_e64 v14, -1, v14, s[4:5] +; CGP-NEXT: v_mov_b32_e32 v0, v5 ; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[0:1] -; CGP-NEXT: v_cndmask_b32_e64 v19, 0, -1, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v13 +; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v15 +; CGP-NEXT: v_addc_u32_e32 v18, vcc, 0, v16, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v13, v4 ; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[0:1] -; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v19, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, 1, v17 -; CGP-NEXT: v_addc_u32_e32 v13, vcc, 0, v18, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; CGP-NEXT: v_mul_lo_u32 v5, v7, v0 -; CGP-NEXT: v_cndmask_b32_e32 v17, v17, v1, vcc -; CGP-NEXT: v_cndmask_b32_e32 v13, v18, v13, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v5 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 +; CGP-NEXT: v_cndmask_b32_e32 v13, -1, v13, vcc +; CGP-NEXT: v_add_i32_e32 v1, vcc, 1, v5 +; CGP-NEXT: v_addc_u32_e32 v17, vcc, 0, v18, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; CGP-NEXT: v_mul_lo_u32 v13, v7, v0 +; CGP-NEXT: v_cndmask_b32_e32 v5, v5, v1, vcc +; CGP-NEXT: v_cndmask_b32_e32 v17, v18, v17, vcc +; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v13 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v11 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v10, v8, v0 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, v1 -; CGP-NEXT: v_mul_hi_u32 v5, v7, v0 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; CGP-NEXT: v_mul_lo_u32 v11, v8, v0 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v1 +; CGP-NEXT: v_mul_hi_u32 v10, v7, v0 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v1 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, v1 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v1 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v10 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v7, v1 +; CGP-NEXT: v_addc_u32_e32 v11, vcc, v8, v0, vcc +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0 ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; CGP-NEXT: v_cndmask_b32_e32 v5, v15, v17, vcc -; CGP-NEXT: v_xor_b32_e32 v11, v5, v9 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v8, v[1:2] -; CGP-NEXT: v_cndmask_b32_e32 v10, v16, v13, vcc -; CGP-NEXT: v_xor_b32_e32 v1, v10, v9 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v7, v[5:6] -; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc -; CGP-NEXT: v_xor_b32_e32 v12, v2, v10 -; CGP-NEXT: v_mul_lo_u32 v2, v8, v0 -; CGP-NEXT: v_mul_lo_u32 v6, v7, v5 -; CGP-NEXT: v_xor_b32_e32 v13, v3, v10 -; CGP-NEXT: v_mul_hi_u32 v3, v7, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 +; CGP-NEXT: ; implicit-def: $vgpr7_vgpr8 +; CGP-NEXT: v_cndmask_b32_e32 v5, v15, v5, vcc +; CGP-NEXT: v_mov_b32_e32 v7, v1 +; CGP-NEXT: v_xor_b32_e32 v13, v5, v9 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v11, v[7:8] +; CGP-NEXT: v_ashrrev_i32_e32 v7, 31, v3 +; CGP-NEXT: v_cndmask_b32_e32 v12, v16, v17, vcc +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v10, v[5:6] +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v7 +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v7, vcc +; CGP-NEXT: v_xor_b32_e32 v8, v2, v7 +; CGP-NEXT: v_mul_lo_u32 v2, v11, v0 +; CGP-NEXT: v_mul_lo_u32 v6, v10, v5 +; CGP-NEXT: v_xor_b32_e32 v1, v12, v9 +; CGP-NEXT: v_xor_b32_e32 v12, v3, v7 +; CGP-NEXT: v_mul_hi_u32 v3, v10, v0 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v3, v8, v5 +; CGP-NEXT: v_mul_lo_u32 v3, v11, v5 +; CGP-NEXT: v_mul_hi_u32 v0, v11, v0 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; CGP-NEXT: v_mul_hi_u32 v6, v7, v5 +; CGP-NEXT: v_mul_hi_u32 v6, v10, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; CGP-NEXT: v_mul_hi_u32 v5, v8, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v11, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v0 -; CGP-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc -; CGP-NEXT: v_mul_lo_u32 v5, v13, v3 -; CGP-NEXT: v_mul_lo_u32 v6, v12, v2 -; CGP-NEXT: v_mul_hi_u32 v7, v12, v3 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v9 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v0 +; CGP-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc +; CGP-NEXT: v_mul_lo_u32 v5, v12, v3 +; CGP-NEXT: v_mul_lo_u32 v6, v8, v2 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v13, v9 ; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc +; CGP-NEXT: v_mul_hi_u32 v9, v8, v3 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v7, v13, v2 -; CGP-NEXT: v_mul_hi_u32 v3, v13, v3 +; CGP-NEXT: v_mul_lo_u32 v9, v12, v2 +; CGP-NEXT: v_mul_hi_u32 v3, v12, v3 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CGP-NEXT: v_mul_hi_u32 v6, v12, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v6, v8, v2 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v3 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v3, v5 -; CGP-NEXT: v_mul_hi_u32 v8, v13, v2 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v3, v5 +; CGP-NEXT: v_mul_hi_u32 v10, v12, v2 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v9, 0 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v5 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v8, v[3:4] -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 -; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v5, vcc -; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v5 +; CGP-NEXT: ; implicit-def: $vgpr5_vgpr6 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v8, v2 +; CGP-NEXT: v_mov_b32_e32 v5, v3 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v10, v[5:6] +; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v12, v5, vcc +; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v12, v5 ; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v4 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 @@ -1623,24 +1675,24 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v3 ; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; CGP-NEXT: v_cndmask_b32_e64 v3, -1, v6, s[4:5] -; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v7 -; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v8, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v9 +; CGP-NEXT: v_addc_u32_e32 v8, vcc, 0, v10, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 ; CGP-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v6 -; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v9, vcc +; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v8, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; CGP-NEXT: v_cndmask_b32_e32 v2, v6, v4, vcc -; CGP-NEXT: v_cndmask_b32_e32 v4, v9, v5, vcc +; CGP-NEXT: v_cndmask_b32_e32 v4, v8, v5, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; CGP-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc -; CGP-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc -; CGP-NEXT: v_xor_b32_e32 v2, v2, v10 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v10 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc +; CGP-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc +; CGP-NEXT: v_cndmask_b32_e32 v3, v10, v4, vcc +; CGP-NEXT: v_xor_b32_e32 v2, v2, v7 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v7 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v7 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc ; CGP-NEXT: s_setpc_b64 s[30:31] %result = sdiv <2 x i64> %num, ret <2 x i64> %result @@ -1676,126 +1728,132 @@ define i64 @v_sdiv_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: v_xor_b32_e32 v1, v5, v0 ; CHECK-NEXT: v_cvt_f32_u32_e32 v5, v2 ; CHECK-NEXT: v_cvt_f32_u32_e32 v6, v1 -; CHECK-NEXT: v_sub_i32_e32 v9, vcc, 0, v2 -; CHECK-NEXT: v_subb_u32_e32 v10, vcc, 0, v1, vcc +; CHECK-NEXT: v_sub_i32_e32 v10, vcc, 0, v2 +; CHECK-NEXT: v_subb_u32_e32 v11, vcc, 0, v1, vcc ; CHECK-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v5, v5 ; CHECK-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 ; CHECK-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 ; CHECK-NEXT: v_trunc_f32_e32 v7, v6 ; CHECK-NEXT: v_mac_f32_e32 v5, 0xcf800000, v7 -; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v5 -; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v7 -; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v8, 0 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[6:7] -; CHECK-NEXT: v_mul_hi_u32 v12, v8, v5 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7] -; CHECK-NEXT: v_mul_lo_u32 v7, v11, v5 -; CHECK-NEXT: v_mul_hi_u32 v5, v11, v5 -; CHECK-NEXT: v_mul_lo_u32 v13, v8, v6 -; CHECK-NEXT: v_mul_lo_u32 v14, v11, v6 +; CHECK-NEXT: v_cvt_u32_f32_e32 v9, v5 +; CHECK-NEXT: v_cvt_u32_f32_e32 v12, v7 +; CHECK-NEXT: ; implicit-def: $vgpr7_vgpr8 +; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v9, 0 +; CHECK-NEXT: v_mov_b32_e32 v7, v6 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v12, v[7:8] +; CHECK-NEXT: v_mul_hi_u32 v8, v9, v5 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v9, v[6:7] +; CHECK-NEXT: v_mul_lo_u32 v7, v12, v5 +; CHECK-NEXT: v_mul_hi_u32 v5, v12, v5 +; CHECK-NEXT: v_mul_lo_u32 v13, v9, v6 +; CHECK-NEXT: v_mul_lo_u32 v14, v12, v6 ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v13 ; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v12 -; CHECK-NEXT: v_mul_hi_u32 v12, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; CHECK-NEXT: v_mul_hi_u32 v8, v9, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v13, v7 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v14, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v13, v8 +; CHECK-NEXT: v_mul_hi_u32 v6, v12, v6 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v12, v7 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v5 -; CHECK-NEXT: v_addc_u32_e32 v11, vcc, v11, v6, vcc -; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v8, 0 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[6:7] -; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v9 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7] -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v9, vcc -; CHECK-NEXT: v_xor_b32_e32 v7, v3, v9 -; CHECK-NEXT: v_mul_lo_u32 v3, v11, v5 -; CHECK-NEXT: v_mul_lo_u32 v10, v8, v6 -; CHECK-NEXT: v_xor_b32_e32 v12, v4, v9 -; CHECK-NEXT: v_mul_hi_u32 v4, v8, v5 -; CHECK-NEXT: v_mul_hi_u32 v5, v11, v5 +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v5 +; CHECK-NEXT: v_addc_u32_e32 v12, vcc, v12, v6, vcc +; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v9, 0 +; CHECK-NEXT: ; implicit-def: $vgpr7_vgpr8 +; CHECK-NEXT: v_mov_b32_e32 v7, v6 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v12, v[7:8] +; CHECK-NEXT: v_ashrrev_i32_e32 v8, 31, v4 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v8 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v9, v[6:7] +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v8, vcc +; CHECK-NEXT: v_xor_b32_e32 v7, v3, v8 +; CHECK-NEXT: v_mul_lo_u32 v3, v12, v5 +; CHECK-NEXT: v_mul_lo_u32 v10, v9, v6 +; CHECK-NEXT: v_xor_b32_e32 v11, v4, v8 +; CHECK-NEXT: v_mul_hi_u32 v4, v9, v5 +; CHECK-NEXT: v_mul_hi_u32 v5, v12, v5 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v10 ; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v4, v11, v6 +; CHECK-NEXT: v_mul_lo_u32 v4, v12, v6 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v10, v3 -; CHECK-NEXT: v_mul_hi_u32 v10, v8, v6 +; CHECK-NEXT: v_mul_hi_u32 v10, v9, v6 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v10 ; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6 +; CHECK-NEXT: v_mul_hi_u32 v6, v12, v6 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v8, v3 -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v11, v4, vcc -; CHECK-NEXT: v_mul_lo_u32 v5, v12, v3 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v9, v3 +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v12, v4, vcc +; CHECK-NEXT: v_mul_lo_u32 v5, v11, v3 ; CHECK-NEXT: v_mul_lo_u32 v6, v7, v4 -; CHECK-NEXT: v_mul_hi_u32 v8, v7, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v12, v3 -; CHECK-NEXT: v_mul_hi_u32 v10, v12, v4 +; CHECK-NEXT: v_mul_hi_u32 v9, v7, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v11, v3 +; CHECK-NEXT: v_mul_hi_u32 v10, v11, v4 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v9 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v8, v12, v4 +; CHECK-NEXT: v_mul_lo_u32 v9, v11, v4 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; CHECK-NEXT: v_mul_hi_u32 v6, v7, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v8, v3 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v9, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v3, v5 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v8, 0 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v9, v6 +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v3, v5 +; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v9, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v10, v5 -; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, v[4:5] +; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v5 +; CHECK-NEXT: ; implicit-def: $vgpr5_vgpr6 ; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v7, v3 -; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v8, v[4:5] -; CHECK-NEXT: v_subb_u32_e64 v5, s[4:5], v12, v4, vcc -; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v12, v4 +; CHECK-NEXT: v_mov_b32_e32 v5, v4 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v10, v[5:6] +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v9, v[4:5] +; CHECK-NEXT: v_subb_u32_e64 v5, s[4:5], v11, v4, vcc +; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v11, v4 ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v1 ; CHECK-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v2 ; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v3, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v1 ; CHECK-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v5, v7, v10, s[4:5] -; CHECK-NEXT: v_add_i32_e32 v7, vcc, 1, v8 -; CHECK-NEXT: v_addc_u32_e32 v10, vcc, 0, v6, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v5, v6, v7, s[4:5] +; CHECK-NEXT: v_add_i32_e32 v6, vcc, 1, v9 +; CHECK-NEXT: v_addc_u32_e32 v7, vcc, 0, v10, vcc ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 ; CHECK-NEXT: v_cndmask_b32_e32 v1, v11, v2, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v7 -; CHECK-NEXT: v_addc_u32_e32 v3, vcc, 0, v10, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v6 +; CHECK-NEXT: v_addc_u32_e32 v3, vcc, 0, v7, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v2, v10, v3, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v3, v9, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc +; CHECK-NEXT: v_xor_b32_e32 v3, v8, v0 +; CHECK-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc ; CHECK-NEXT: v_xor_b32_e32 v0, v1, v3 ; CHECK-NEXT: v_xor_b32_e32 v1, v2, v3 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 @@ -1856,182 +1914,188 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v7 ; GISEL-NEXT: v_trunc_f32_e32 v13, v11 ; GISEL-NEXT: v_mac_f32_e32 v7, 0xcf800000, v13 -; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v7 +; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7 ; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v13 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v14, 0 -; GISEL-NEXT: v_mov_b32_e32 v7, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[7:8] -; GISEL-NEXT: v_mul_lo_u32 v7, v17, v11 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[12:13] -; GISEL-NEXT: v_mul_lo_u32 v13, v14, v12 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v13 -; GISEL-NEXT: v_mul_hi_u32 v13, v14, v11 +; GISEL-NEXT: ; implicit-def: $vgpr13_vgpr14 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v7, 0 +; GISEL-NEXT: v_mov_b32_e32 v13, v12 +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[13:14] +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v7, v[12:13] +; GISEL-NEXT: v_mul_lo_u32 v13, v17, v11 +; GISEL-NEXT: v_mul_lo_u32 v14, v7, v12 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; GISEL-NEXT: v_mul_hi_u32 v14, v7, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc ; GISEL-NEXT: v_mul_hi_u32 v11, v17, v11 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v17, v12 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v18, v7 -; GISEL-NEXT: v_mul_hi_u32 v18, v14, v12 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v14, v17, v12 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v13 +; GISEL-NEXT: v_mul_hi_u32 v18, v7, v12 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v14, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v18 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v18 ; GISEL-NEXT: v_mul_hi_u32 v12, v17, v12 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v11, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v7 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v17, v11, vcc -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v14, 0 -; GISEL-NEXT: v_mov_b32_e32 v7, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[7:8] +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v7, v11 +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v17, v12, vcc +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v18, 0 +; GISEL-NEXT: ; implicit-def: $vgpr13_vgpr14 ; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[12:13] -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc +; GISEL-NEXT: v_mov_b32_e32 v13, v12 +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[13:14] ; GISEL-NEXT: v_xor_b32_e32 v15, v0, v7 ; GISEL-NEXT: v_mul_lo_u32 v0, v17, v11 -; GISEL-NEXT: v_mul_lo_u32 v13, v14, v12 -; GISEL-NEXT: v_xor_b32_e32 v16, v1, v7 -; GISEL-NEXT: v_mul_hi_u32 v1, v14, v11 +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v18, v[12:13] +; GISEL-NEXT: v_mul_hi_u32 v14, v18, v11 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc +; GISEL-NEXT: v_mul_lo_u32 v13, v18, v12 ; GISEL-NEXT: v_mul_hi_u32 v11, v17, v11 +; GISEL-NEXT: v_xor_b32_e32 v1, v1, v7 +; GISEL-NEXT: v_xor_b32_e32 v7, v7, v4 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v17, v12 +; GISEL-NEXT: v_mul_lo_u32 v14, v17, v12 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 -; GISEL-NEXT: v_mul_hi_u32 v13, v14, v12 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v13, v18, v12 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v14, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 ; GISEL-NEXT: v_mul_hi_u32 v12, v17, v12 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v12, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v16, v0 -; GISEL-NEXT: v_mul_lo_u32 v12, v15, v1 -; GISEL-NEXT: v_mul_hi_u32 v13, v15, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0 -; GISEL-NEXT: v_xor_b32_e32 v7, v7, v4 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v16, v1 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_mul_hi_u32 v12, v15, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v18, v0 +; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v17, v11, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v1, v0 +; GISEL-NEXT: v_mul_lo_u32 v13, v15, v11 +; GISEL-NEXT: v_mul_hi_u32 v14, v15, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v1, v0 +; GISEL-NEXT: v_mul_hi_u32 v16, v1, v11 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v12 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v12 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 -; GISEL-NEXT: v_mul_hi_u32 v1, v16, v1 +; GISEL-NEXT: v_mul_lo_u32 v14, v1, v11 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_mul_hi_u32 v13, v15, v11 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v12 ; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v8, v0, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v8, v14, v[1:2] -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v6 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v10, v6, vcc +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v13 +; GISEL-NEXT: ; implicit-def: $vgpr13_vgpr14 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v6 +; GISEL-NEXT: v_mov_b32_e32 v13, v12 +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v8, v16, v[13:14] ; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v5, v0, v[12:13] -; GISEL-NEXT: v_xor_b32_e32 v10, v1, v6 -; GISEL-NEXT: v_xor_b32_e32 v9, v9, v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v10 -; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v9 +; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v10, v6, vcc +; GISEL-NEXT: v_xor_b32_e32 v10, v9, v6 +; GISEL-NEXT: v_xor_b32_e32 v9, v13, v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v10 +; GISEL-NEXT: v_cvt_f32_u32_e32 v14, v9 ; GISEL-NEXT: v_sub_i32_e32 v15, vcc, v15, v11 -; GISEL-NEXT: v_sub_i32_e64 v11, s[4:5], v16, v12 -; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v13 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v16, v12, vcc -; GISEL-NEXT: v_subb_u32_e32 v13, vcc, v11, v5, vcc -; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 -; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v1 -; GISEL-NEXT: v_trunc_f32_e32 v16, v11 -; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v16 -; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v1 +; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v1, v12, vcc +; GISEL-NEXT: v_mac_f32_e32 v13, 0x4f800000, v14 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v11, v13 +; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v12 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc +; GISEL-NEXT: v_mul_f32_e32 v11, 0x5f7ffffc, v11 +; GISEL-NEXT: v_mul_f32_e32 v12, 0x2f800000, v11 +; GISEL-NEXT: v_trunc_f32_e32 v13, v12 +; GISEL-NEXT: v_mac_f32_e32 v11, 0xcf800000, v13 +; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v11 ; GISEL-NEXT: v_sub_i32_e32 v19, vcc, 0, v10 ; GISEL-NEXT: v_subb_u32_e32 v20, vcc, 0, v9, vcc ; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[6:7], v19, v18, 0 -; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v16 -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v15, v8 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v8 -; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v13, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v8 -; GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v19, v16, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v1, v16, v11 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v15, v5 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v20, v18, v[12:13] +; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v15, v8 +; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v13 +; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v14, v8 +; GISEL-NEXT: ; implicit-def: $vgpr13_vgpr14 +; GISEL-NEXT: v_mul_lo_u32 v8, v15, v11 +; GISEL-NEXT: v_mov_b32_e32 v13, v12 +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v19, v15, v[13:14] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v1, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v20, v18, v[12:13] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v17, v5 -; GISEL-NEXT: v_mul_lo_u32 v8, v18, v12 -; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, v18, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[6:7] -; GISEL-NEXT: v_mul_hi_u32 v11, v16, v11 -; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[8:9] -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v15, v5 +; GISEL-NEXT: v_mul_lo_u32 v13, v18, v12 +; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v8, v13 +; GISEL-NEXT: v_mul_hi_u32 v13, v18, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[6:7] +; GISEL-NEXT: v_mul_hi_u32 v11, v15, v11 +; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v8, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[8:9] +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v17, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[8:9] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v5, v8, v13, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v5, v15, v21, s[4:5] -; GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v14, vcc +; GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v16, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[8:9] ; GISEL-NEXT: v_add_i32_e32 v17, vcc, 1, v8 -; GISEL-NEXT: v_addc_u32_e32 v21, vcc, 0, v15, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v17, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v8, v15, v21, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 -; GISEL-NEXT: v_mul_lo_u32 v15, v16, v12 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v15, v11 -; GISEL-NEXT: v_mul_hi_u32 v15, v18, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v21, s[4:5] +; GISEL-NEXT: v_addc_u32_e32 v21, vcc, 0, v13, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v8, v17, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v8, v13, v21, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[6:7] +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; GISEL-NEXT: v_mul_lo_u32 v14, v15, v12 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v14, v11 +; GISEL-NEXT: v_mul_hi_u32 v14, v18, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_mul_hi_u32 v12, v16, v12 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v15 +; GISEL-NEXT: v_mul_hi_u32 v12, v15, v12 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v17, v14 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v11 -; GISEL-NEXT: v_addc_u32_e32 v15, vcc, v16, v12, vcc +; GISEL-NEXT: v_addc_u32_e32 v14, vcc, v15, v12, vcc ; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v19, v13, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v0, v5, vcc +; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GISEL-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc ; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v15, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v14, v[0:1] ; GISEL-NEXT: v_ashrrev_i32_e32 v12, 31, v3 -; GISEL-NEXT: v_cndmask_b32_e32 v8, v14, v8, vcc +; GISEL-NEXT: v_mul_hi_u32 v4, v14, v11 ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v13, v[0:1] ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v12 ; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v3, v12, vcc -; GISEL-NEXT: v_xor_b32_e32 v14, v1, v12 -; GISEL-NEXT: v_mul_lo_u32 v1, v15, v11 +; GISEL-NEXT: v_xor_b32_e32 v15, v1, v12 +; GISEL-NEXT: v_mul_lo_u32 v1, v14, v11 ; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0 ; GISEL-NEXT: v_xor_b32_e32 v16, v2, v12 ; GISEL-NEXT: v_mul_hi_u32 v2, v13, v11 -; GISEL-NEXT: v_mul_hi_u32 v4, v15, v11 +; GISEL-NEXT: v_xor_b32_e32 v5, v5, v7 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v2, v15, v0 +; GISEL-NEXT: v_mul_lo_u32 v2, v14, v0 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v3, v1 ; GISEL-NEXT: v_mul_hi_u32 v3, v13, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 @@ -2039,25 +2103,25 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; GISEL-NEXT: v_addc_u32_e32 v0, vcc, v15, v0, vcc +; GISEL-NEXT: v_addc_u32_e32 v0, vcc, v14, v0, vcc ; GISEL-NEXT: v_mul_lo_u32 v2, v16, v1 -; GISEL-NEXT: v_mul_lo_u32 v3, v14, v0 -; GISEL-NEXT: v_mul_hi_u32 v4, v14, v1 +; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0 +; GISEL-NEXT: v_mul_hi_u32 v4, v15, v1 ; GISEL-NEXT: v_mul_hi_u32 v1, v16, v1 -; GISEL-NEXT: v_xor_b32_e32 v5, v5, v7 +; GISEL-NEXT: v_xor_b32_e32 v8, v8, v7 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_mul_lo_u32 v4, v16, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GISEL-NEXT: v_mul_hi_u32 v3, v14, v0 +; GISEL-NEXT: v_mul_hi_u32 v3, v15, v0 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3 @@ -2069,13 +2133,13 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v0, v1 +; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 ; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v13, v[0:1] -; GISEL-NEXT: v_xor_b32_e32 v8, v8, v7 ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v5, v7 ; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v11, v[3:4] ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v7, vcc -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v14, v2 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v15, v2 ; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v16, v3, vcc ; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v16, v3 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v9 @@ -2135,126 +2199,132 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_xor_b32_e32 v1, v10, v0 ; CGP-NEXT: v_cvt_f32_u32_e32 v10, v4 ; CGP-NEXT: v_cvt_f32_u32_e32 v11, v1 -; CGP-NEXT: v_sub_i32_e32 v14, vcc, 0, v4 -; CGP-NEXT: v_subb_u32_e32 v15, vcc, 0, v1, vcc +; CGP-NEXT: v_sub_i32_e32 v15, vcc, 0, v4 +; CGP-NEXT: v_subb_u32_e32 v16, vcc, 0, v1, vcc ; CGP-NEXT: v_mac_f32_e32 v10, 0x4f800000, v11 ; CGP-NEXT: v_rcp_iflag_f32_e32 v10, v10 ; CGP-NEXT: v_mul_f32_e32 v10, 0x5f7ffffc, v10 ; CGP-NEXT: v_mul_f32_e32 v11, 0x2f800000, v10 ; CGP-NEXT: v_trunc_f32_e32 v12, v11 ; CGP-NEXT: v_mac_f32_e32 v10, 0xcf800000, v12 -; CGP-NEXT: v_cvt_u32_f32_e32 v13, v10 -; CGP-NEXT: v_cvt_u32_f32_e32 v16, v12 -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[11:12] -; CGP-NEXT: v_mul_hi_u32 v17, v13, v10 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12] -; CGP-NEXT: v_mul_lo_u32 v12, v16, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v16, v10 -; CGP-NEXT: v_mul_lo_u32 v18, v13, v11 -; CGP-NEXT: v_mul_lo_u32 v19, v16, v11 +; CGP-NEXT: v_cvt_u32_f32_e32 v14, v10 +; CGP-NEXT: v_cvt_u32_f32_e32 v17, v12 +; CGP-NEXT: ; implicit-def: $vgpr12_vgpr13 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v15, v14, 0 +; CGP-NEXT: v_mov_b32_e32 v12, v11 +; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v17, v[12:13] +; CGP-NEXT: v_mul_hi_u32 v13, v14, v10 +; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v14, v[11:12] +; CGP-NEXT: v_mul_lo_u32 v12, v17, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v17, v10 +; CGP-NEXT: v_mul_lo_u32 v18, v14, v11 +; CGP-NEXT: v_mul_lo_u32 v19, v17, v11 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v18 ; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v17 -; CGP-NEXT: v_mul_hi_u32 v17, v13, v11 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; CGP-NEXT: v_mul_hi_u32 v13, v14, v11 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v12, vcc, v18, v12 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v19, v10 ; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v17 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v17, vcc, v18, v17 -; CGP-NEXT: v_mul_hi_u32 v11, v16, v11 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v13 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v18, v13 +; CGP-NEXT: v_mul_hi_u32 v11, v17, v11 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v17, v12 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 ; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v10 -; CGP-NEXT: v_addc_u32_e32 v16, vcc, v16, v11, vcc -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[11:12] -; CGP-NEXT: v_ashrrev_i32_e32 v14, 31, v9 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v14 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12] -; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v14, vcc -; CGP-NEXT: v_xor_b32_e32 v12, v8, v14 -; CGP-NEXT: v_mul_lo_u32 v8, v16, v10 -; CGP-NEXT: v_mul_lo_u32 v15, v13, v11 -; CGP-NEXT: v_xor_b32_e32 v17, v9, v14 -; CGP-NEXT: v_mul_hi_u32 v9, v13, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v16, v10 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v10 +; CGP-NEXT: v_addc_u32_e32 v17, vcc, v17, v11, vcc +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v15, v14, 0 +; CGP-NEXT: ; implicit-def: $vgpr12_vgpr13 +; CGP-NEXT: v_mov_b32_e32 v12, v11 +; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v17, v[12:13] +; CGP-NEXT: v_ashrrev_i32_e32 v13, 31, v9 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v13 +; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v14, v[11:12] +; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v13, vcc +; CGP-NEXT: v_xor_b32_e32 v12, v8, v13 +; CGP-NEXT: v_mul_lo_u32 v8, v17, v10 +; CGP-NEXT: v_mul_lo_u32 v15, v14, v11 +; CGP-NEXT: v_xor_b32_e32 v16, v9, v13 +; CGP-NEXT: v_mul_hi_u32 v9, v14, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v17, v10 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v15 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v16, v11 +; CGP-NEXT: v_mul_lo_u32 v9, v17, v11 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v15, v8 -; CGP-NEXT: v_mul_hi_u32 v15, v13, v11 +; CGP-NEXT: v_mul_hi_u32 v15, v14, v11 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v15 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v15 -; CGP-NEXT: v_mul_hi_u32 v11, v16, v11 +; CGP-NEXT: v_mul_hi_u32 v11, v17, v11 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v13, v8 -; CGP-NEXT: v_addc_u32_e32 v9, vcc, v16, v9, vcc -; CGP-NEXT: v_mul_lo_u32 v10, v17, v8 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v14, v8 +; CGP-NEXT: v_addc_u32_e32 v9, vcc, v17, v9, vcc +; CGP-NEXT: v_mul_lo_u32 v10, v16, v8 ; CGP-NEXT: v_mul_lo_u32 v11, v12, v9 -; CGP-NEXT: v_mul_hi_u32 v13, v12, v8 -; CGP-NEXT: v_mul_hi_u32 v8, v17, v8 -; CGP-NEXT: v_mul_hi_u32 v15, v17, v9 +; CGP-NEXT: v_mul_hi_u32 v14, v12, v8 +; CGP-NEXT: v_mul_hi_u32 v8, v16, v8 +; CGP-NEXT: v_mul_hi_u32 v15, v16, v9 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v13 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v14 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v17, v9 +; CGP-NEXT: v_mul_lo_u32 v14, v16, v9 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; CGP-NEXT: v_mul_hi_u32 v11, v12, v9 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v13, v8 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v14, v8 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v8, v10 -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v13, 0 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v14, v11 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v8, v10 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v14, 0 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v15, v10 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v11, v[9:10] +; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v10 +; CGP-NEXT: ; implicit-def: $vgpr10_vgpr11 ; CGP-NEXT: v_sub_i32_e32 v8, vcc, v12, v8 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v1, v13, v[9:10] -; CGP-NEXT: v_subb_u32_e64 v10, s[4:5], v17, v9, vcc -; CGP-NEXT: v_sub_i32_e64 v9, s[4:5], v17, v9 +; CGP-NEXT: v_mov_b32_e32 v10, v9 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v15, v[10:11] +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v1, v14, v[9:10] +; CGP-NEXT: v_subb_u32_e64 v10, s[4:5], v16, v9, vcc +; CGP-NEXT: v_sub_i32_e64 v9, s[4:5], v16, v9 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v1 ; CGP-NEXT: v_subb_u32_e32 v9, vcc, v9, v1, vcc -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v4 ; CGP-NEXT: v_sub_i32_e32 v8, vcc, v8, v4 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v1 ; CGP-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v9, vcc -; CGP-NEXT: v_cndmask_b32_e64 v10, v12, v15, s[4:5] -; CGP-NEXT: v_add_i32_e32 v12, vcc, 1, v13 -; CGP-NEXT: v_addc_u32_e32 v15, vcc, 0, v11, vcc +; CGP-NEXT: v_cndmask_b32_e64 v10, v11, v12, s[4:5] +; CGP-NEXT: v_add_i32_e32 v11, vcc, 1, v14 +; CGP-NEXT: v_addc_u32_e32 v12, vcc, 0, v15, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v9, v1 ; CGP-NEXT: v_cndmask_b32_e64 v16, 0, -1, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v8, v4 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v9, v1 ; CGP-NEXT: v_cndmask_b32_e32 v1, v16, v4, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v12 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, 0, v15, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v11 +; CGP-NEXT: v_addc_u32_e32 v8, vcc, 0, v12, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; CGP-NEXT: v_cndmask_b32_e32 v1, v12, v4, vcc -; CGP-NEXT: v_cndmask_b32_e32 v4, v15, v8, vcc +; CGP-NEXT: v_cndmask_b32_e32 v1, v11, v4, vcc +; CGP-NEXT: v_cndmask_b32_e32 v4, v12, v8, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; CGP-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc -; CGP-NEXT: v_xor_b32_e32 v8, v14, v0 -; CGP-NEXT: v_cndmask_b32_e32 v4, v11, v4, vcc +; CGP-NEXT: v_cndmask_b32_e32 v1, v14, v1, vcc +; CGP-NEXT: v_xor_b32_e32 v8, v13, v0 +; CGP-NEXT: v_cndmask_b32_e32 v4, v15, v4, vcc ; CGP-NEXT: v_xor_b32_e32 v0, v1, v8 ; CGP-NEXT: v_xor_b32_e32 v1, v4, v8 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 @@ -2318,72 +2388,74 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_mul_f32_e32 v8, 0x2f800000, v6 ; CGP-NEXT: v_trunc_f32_e32 v10, v8 ; CGP-NEXT: v_mac_f32_e32 v6, 0xcf800000, v10 -; CGP-NEXT: v_cvt_u32_f32_e32 v11, v6 +; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6 ; CGP-NEXT: v_cvt_u32_f32_e32 v14, v10 -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v11, 0 -; CGP-NEXT: v_mov_b32_e32 v6, v9 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v14, v[6:7] -; CGP-NEXT: v_mul_lo_u32 v6, v14, v8 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[9:10] -; CGP-NEXT: v_mul_hi_u32 v10, v11, v8 +; CGP-NEXT: ; implicit-def: $vgpr10_vgpr11 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v6, 0 +; CGP-NEXT: v_mov_b32_e32 v10, v9 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v14, v[10:11] +; CGP-NEXT: v_mul_hi_u32 v11, v6, v8 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v6, v[9:10] +; CGP-NEXT: v_mul_lo_u32 v10, v14, v8 ; CGP-NEXT: v_mul_hi_u32 v8, v14, v8 -; CGP-NEXT: v_mul_lo_u32 v15, v11, v9 +; CGP-NEXT: v_mul_lo_u32 v15, v6, v9 ; CGP-NEXT: v_mul_lo_u32 v16, v14, v9 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v15 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v15 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v11, v9 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v15, v6 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; CGP-NEXT: v_mul_hi_u32 v11, v6, v9 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v15, v10 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v16, v8 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v15, v11 +; CGP-NEXT: v_mul_hi_u32 v9, v14, v9 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v15, v10 -; CGP-NEXT: v_mul_hi_u32 v9, v14, v9 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v6 -; CGP-NEXT: v_addc_u32_e32 v14, vcc, v14, v8, vcc -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v11, 0 -; CGP-NEXT: v_mov_b32_e32 v6, v9 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v14, v[6:7] -; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v7 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[9:10] -; CGP-NEXT: v_addc_u32_e32 v6, vcc, v7, v12, vcc -; CGP-NEXT: v_xor_b32_e32 v10, v5, v12 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; CGP-NEXT: v_addc_u32_e32 v14, vcc, v14, v9, vcc +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v6, 0 +; CGP-NEXT: ; implicit-def: $vgpr10_vgpr11 +; CGP-NEXT: v_mov_b32_e32 v10, v9 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v14, v[10:11] +; CGP-NEXT: v_ashrrev_i32_e32 v11, 31, v7 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v11 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v6, v[9:10] +; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v11, vcc +; CGP-NEXT: v_xor_b32_e32 v10, v5, v11 ; CGP-NEXT: v_mul_lo_u32 v5, v14, v8 -; CGP-NEXT: v_mul_lo_u32 v7, v11, v9 -; CGP-NEXT: v_xor_b32_e32 v13, v6, v12 -; CGP-NEXT: v_mul_hi_u32 v6, v11, v8 +; CGP-NEXT: v_mul_lo_u32 v12, v6, v9 +; CGP-NEXT: v_xor_b32_e32 v13, v7, v11 +; CGP-NEXT: v_mul_hi_u32 v7, v6, v8 ; CGP-NEXT: v_mul_hi_u32 v8, v14, v8 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v6, v14, v9 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CGP-NEXT: v_mul_hi_u32 v7, v11, v9 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; CGP-NEXT: v_mul_lo_u32 v7, v14, v9 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5 +; CGP-NEXT: v_mul_hi_u32 v12, v6, v9 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v12 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v12 +; CGP-NEXT: v_mul_hi_u32 v9, v14, v9 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CGP-NEXT: v_mul_hi_u32 v8, v14, v9 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v11, v5 -; CGP-NEXT: v_addc_u32_e32 v6, vcc, v14, v6, vcc +; CGP-NEXT: v_addc_u32_e32 v6, vcc, v14, v7, vcc ; CGP-NEXT: v_mul_lo_u32 v7, v13, v5 ; CGP-NEXT: v_mul_lo_u32 v8, v10, v6 ; CGP-NEXT: v_mul_hi_u32 v9, v10, v5 ; CGP-NEXT: v_mul_hi_u32 v5, v13, v5 -; CGP-NEXT: v_mul_hi_u32 v11, v13, v6 +; CGP-NEXT: v_mul_hi_u32 v12, v13, v6 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 @@ -2400,38 +2472,40 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v9, 0 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v11, v7 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v8, v[6:7] +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v7 +; CGP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; CGP-NEXT: v_sub_i32_e32 v5, vcc, v10, v5 +; CGP-NEXT: v_mov_b32_e32 v7, v6 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v12, v[7:8] ; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v9, v[6:7] ; CGP-NEXT: v_subb_u32_e64 v7, s[4:5], v13, v6, vcc ; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v13, v6 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v3 ; CGP-NEXT: v_subb_u32_e32 v6, vcc, v6, v3, vcc -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v4 ; CGP-NEXT: v_sub_i32_e32 v5, vcc, v5, v4 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v3 ; CGP-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v6, vcc -; CGP-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[4:5] -; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v9 -; CGP-NEXT: v_addc_u32_e32 v11, vcc, 0, v8, vcc +; CGP-NEXT: v_cndmask_b32_e64 v7, v8, v10, s[4:5] +; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v9 +; CGP-NEXT: v_addc_u32_e32 v10, vcc, 0, v12, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v6, v3 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v4 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 ; CGP-NEXT: v_cndmask_b32_e32 v3, v13, v4, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v10 -; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v11, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v8 +; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v10, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; CGP-NEXT: v_cndmask_b32_e32 v3, v10, v4, vcc -; CGP-NEXT: v_cndmask_b32_e32 v4, v11, v5, vcc +; CGP-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc +; CGP-NEXT: v_cndmask_b32_e32 v4, v10, v5, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 ; CGP-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc -; CGP-NEXT: v_xor_b32_e32 v5, v12, v2 -; CGP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; CGP-NEXT: v_xor_b32_e32 v5, v11, v2 +; CGP-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc ; CGP-NEXT: v_xor_b32_e32 v2, v3, v5 ; CGP-NEXT: v_xor_b32_e32 v3, v4, v5 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 @@ -2545,8 +2619,10 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_mac_f32_e32 v3, 0xcf800000, v7 ; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v3 ; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v7 +; GISEL-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v9, 0 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[4:5] +; GISEL-NEXT: v_mov_b32_e32 v7, v4 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[7:8] ; GISEL-NEXT: v_mul_lo_u32 v4, v12, v3 ; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8] ; GISEL-NEXT: v_mul_hi_u32 v8, v9, v3 @@ -2572,7 +2648,9 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v3 ; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v12, v4, vcc ; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v9, 0 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[4:5] +; GISEL-NEXT: ; implicit-def: $vgpr7_vgpr8 +; GISEL-NEXT: v_mov_b32_e32 v7, v4 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[7:8] ; GISEL-NEXT: v_mul_lo_u32 v4, v12, v3 ; GISEL-NEXT: v_and_b32_e32 v10, 0xffffff, v0 ; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8] @@ -2614,122 +2692,127 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; GISEL-NEXT: v_mul_hi_u32 v4, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v0, 0 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v3 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v4, v[7:8] +; GISEL-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GISEL-NEXT: v_mov_b32_e32 v8, v7 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v4, v[8:9] +; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v3 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], 0, v0, v[7:8] ; GISEL-NEXT: v_mac_f32_e32 v9, 0x4f800000, v5 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v2, v9 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], 0, v0, v[7:8] -; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v10, v6 +; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v10, v6 +; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, v7, vcc ; GISEL-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v2 -; GISEL-NEXT: v_subb_u32_e64 v9, s[4:5], 0, v7, vcc -; GISEL-NEXT: v_sub_i32_e64 v10, s[4:5], 0, v7 +; GISEL-NEXT: v_sub_i32_e64 v12, s[4:5], 0, v7 ; GISEL-NEXT: v_trunc_f32_e32 v7, v5 ; GISEL-NEXT: v_mac_f32_e32 v2, 0xcf800000, v7 -; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v2 +; GISEL-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], 0, v3 ; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v2, 0 ; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v7 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v1 -; GISEL-NEXT: v_mov_b32_e32 v2, v6 +; GISEL-NEXT: ; implicit-def: $vgpr7_vgpr8 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v1 +; GISEL-NEXT: v_mov_b32_e32 v7, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v15, v[2:3] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, -1, v16, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v14, v12, v[6:7] -; GISEL-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v10, vcc -; GISEL-NEXT: v_mul_lo_u32 v7, v15, v5 -; GISEL-NEXT: v_mul_lo_u32 v10, v12, v6 -; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v8, v1 -; GISEL-NEXT: v_subbrev_u32_e32 v16, vcc, 0, v2, vcc -; GISEL-NEXT: v_mul_hi_u32 v2, v12, v5 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v15, v[7:8] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v8, -1, v16, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v14, v2, v[6:7] +; GISEL-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v12, vcc +; GISEL-NEXT: v_mul_lo_u32 v10, v15, v5 +; GISEL-NEXT: v_mul_lo_u32 v12, v2, v6 +; GISEL-NEXT: v_mul_hi_u32 v16, v2, v5 +; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v9, v1 +; GISEL-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v7, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v7, v2 -; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v7, v15, v6 +; GISEL-NEXT: v_mul_lo_u32 v16, v15, v6 ; GISEL-NEXT: v_mul_hi_u32 v5, v15, v5 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v10, v2 -; GISEL-NEXT: v_mul_hi_u32 v10, v12, v6 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; GISEL-NEXT: v_mul_hi_u32 v12, v2, v6 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v16, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v16, v12 +; GISEL-NEXT: v_mul_hi_u32 v6, v15, v6 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 -; GISEL-NEXT: v_mul_hi_u32 v6, v15, v6 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v12, v2 -; GISEL-NEXT: v_addc_u32_e32 v10, vcc, v15, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v7, 0 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v0 -; GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v4, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v8, v1 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v2, v5 +; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v15, v6, vcc +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v10, 0 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, 1, v0 +; GISEL-NEXT: v_addc_u32_e32 v16, vcc, 0, v4, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v9, v1 +; GISEL-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc ; GISEL-NEXT: v_mov_b32_e32 v1, v6 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v13, v10, v[1:2] -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v14, v7, v[1:2] -; GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v8, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, 1, v12 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v15, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v10, v5 -; GISEL-NEXT: v_mul_lo_u32 v14, v7, v1 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v13, v12, v[1:2] +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 +; GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v9, vcc +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v14, v10, v[1:2] +; GISEL-NEXT: v_add_i32_e32 v2, vcc, 1, v15 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v16, vcc +; GISEL-NEXT: v_mul_lo_u32 v9, v12, v5 +; GISEL-NEXT: v_mul_lo_u32 v13, v10, v1 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v7, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc -; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v13, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v12, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v12, v10, v1 -; GISEL-NEXT: v_mul_hi_u32 v5, v10, v5 -; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v13, v6 -; GISEL-NEXT: v_mul_hi_u32 v13, v7, v1 -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v12, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13 -; GISEL-NEXT: v_mul_hi_u32 v1, v10, v1 -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v6 +; GISEL-NEXT: v_cndmask_b32_e32 v14, v15, v2, vcc +; GISEL-NEXT: v_mul_hi_u32 v2, v10, v5 +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v9, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v2, s[4:5], v6, v2 +; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v6, v12, v1 +; GISEL-NEXT: v_mul_hi_u32 v5, v12, v5 +; GISEL-NEXT: v_add_i32_e64 v2, s[4:5], v9, v2 +; GISEL-NEXT: v_mul_hi_u32 v9, v10, v1 +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v6, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v12, v6 -; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v6 -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v7, v5 -; GISEL-NEXT: v_addc_u32_e64 v1, s[4:5], v10, v1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v6, 0, v5 -; GISEL-NEXT: v_mul_lo_u32 v7, v11, v1 -; GISEL-NEXT: v_mul_hi_u32 v10, v11, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v8, v15, v8, vcc -; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; GISEL-NEXT: v_mul_lo_u32 v7, 0, v1 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v11, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v9 +; GISEL-NEXT: v_mul_hi_u32 v1, v12, v1 +; GISEL-NEXT: v_add_i32_e64 v2, s[4:5], v5, v2 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v6, v5 +; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v5 +; GISEL-NEXT: v_add_i32_e64 v2, s[4:5], v10, v2 +; GISEL-NEXT: v_addc_u32_e64 v1, s[4:5], v12, v1, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v5, 0, v2 +; GISEL-NEXT: v_mul_lo_u32 v6, v11, v1 +; GISEL-NEXT: v_cndmask_b32_e32 v9, v16, v7, vcc +; GISEL-NEXT: v_mul_hi_u32 v7, v11, v2 +; GISEL-NEXT: v_mul_hi_u32 v2, 0, v2 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; GISEL-NEXT: v_mul_lo_u32 v6, 0, v1 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v11, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v5, v6 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v2, v5 ; GISEL-NEXT: v_mul_hi_u32 v1, 0, v1 ; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v10, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v12 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v1, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v1, v2 +; GISEL-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; GISEL-NEXT: v_mov_b32_e32 v1, v6 ; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v12, v[1:2] -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc ; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], 0, v10, v[6:7] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v8, vcc ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v11, v5 ; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v6 ; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 0, v6, vcc diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll index 1441591a5fcce..0fba42ac9575b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll @@ -175,60 +175,64 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_trunc_f32_e32 v2, v1 ; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 ; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s14, v3, 0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s14, v4, v[1:2] -; GFX8-NEXT: v_mul_hi_u32 v5, v3, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s15, v3, v[1:2] -; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX8-NEXT: v_mul_lo_u32 v6, v3, v1 -; GFX8-NEXT: v_mul_lo_u32 v7, v4, v1 -; GFX8-NEXT: v_mul_hi_u32 v8, v3, v1 -; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1 +; GFX8-NEXT: v_cvt_u32_f32_e32 v4, v0 +; GFX8-NEXT: v_cvt_u32_f32_e32 v5, v2 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s14, v4, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s14, v5, v[2:3] +; GFX8-NEXT: v_mul_hi_u32 v3, v4, v0 +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s15, v4, v[1:2] +; GFX8-NEXT: v_mul_lo_u32 v2, v5, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, v5, v0 +; GFX8-NEXT: v_mul_lo_u32 v6, v4, v1 +; GFX8-NEXT: v_mul_lo_u32 v7, v5, v1 +; GFX8-NEXT: v_mul_hi_u32 v8, v4, v1 +; GFX8-NEXT: v_mul_hi_u32 v1, v5, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 ; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v7, v5 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v7, v3 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s14, v3, 0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s14, v4, v[1:2] -; GFX8-NEXT: v_mul_hi_u32 v6, v3, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s15, v3, v[1:2] -; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX8-NEXT: v_mul_lo_u32 v5, v3, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v5, v1, vcc +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s14, v4, 0 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s14, v5, v[2:3] +; GFX8-NEXT: v_mul_hi_u32 v6, v4, v0 +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s15, v4, v[1:2] +; GFX8-NEXT: v_mul_lo_u32 v2, v5, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, v5, v0 +; GFX8-NEXT: v_mul_lo_u32 v3, v4, v1 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v6, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_mul_hi_u32 v5, v3, v1 +; GFX8-NEXT: v_mul_lo_u32 v6, v5, v1 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 +; GFX8-NEXT: v_mul_hi_u32 v3, v4, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v6, v5 -; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v6, v3 +; GFX8-NEXT: v_mul_hi_u32 v1, v5, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v4, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc ; GFX8-NEXT: v_mul_lo_u32 v2, s11, v0 ; GFX8-NEXT: v_mul_lo_u32 v3, s10, v1 ; GFX8-NEXT: v_mul_hi_u32 v4, s10, v0 @@ -250,12 +254,14 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v4, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v2 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v3, v[1:2] +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v2 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX8-NEXT: v_mov_b32_e32 v6, s11 +; GFX8-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v5, v[2:3] ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s10, v0 ; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s9, v4, v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v5, s9 +; GFX8-NEXT: v_mov_b32_e32 v3, s9 ; GFX8-NEXT: v_subb_u32_e64 v2, s[0:1], v6, v1, vcc ; GFX8-NEXT: v_sub_u32_e64 v1, s[0:1], s11, v1 ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v2 @@ -263,19 +269,19 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v2 -; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[0:1] ; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, s8, v0 ; GFX8-NEXT: v_subbrev_u32_e64 v8, s[0:1], 0, v1, vcc ; GFX8-NEXT: v_add_u32_e64 v9, s[0:1], 1, v4 -; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1] +; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v5, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v7 -; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v8 -; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s8, v7 +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s8, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1] ; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v9 ; GFX8-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc @@ -284,19 +290,19 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc ; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v0, v5, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v10, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v0, v3, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v1, s[0:1] ; GFX8-NEXT: s_xor_b64 s[0:1], s[2:3], s[12:13] ; GFX8-NEXT: v_xor_b32_e32 v0, s0, v4 -; GFX8-NEXT: v_xor_b32_e32 v1, s1, v3 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_xor_b32_e32 v1, s1, v5 +; GFX8-NEXT: v_mov_b32_e32 v4, s1 ; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s0, v0 -; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc -; GFX8-NEXT: v_xor_b32_e32 v3, s2, v5 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc +; GFX8-NEXT: v_xor_b32_e32 v3, s2, v3 ; GFX8-NEXT: v_xor_b32_e32 v4, s2, v2 ; GFX8-NEXT: v_mov_b32_e32 v5, s2 ; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s2, v3 @@ -335,59 +341,63 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_trunc_f32_e32 v2, v1 ; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 ; GFX9-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v3, 0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s10, v4, v[1:2] -; GFX9-NEXT: v_mul_hi_u32 v5, v3, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s11, v3, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX9-NEXT: v_mul_lo_u32 v6, v3, v1 -; GFX9-NEXT: v_mul_lo_u32 v7, v4, v1 -; GFX9-NEXT: v_mul_hi_u32 v8, v3, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1 +; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v2 +; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s10, v5, v[2:3] +; GFX9-NEXT: v_mul_hi_u32 v3, v4, v0 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s11, v4, v[1:2] +; GFX9-NEXT: v_mul_lo_u32 v2, v5, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, v5, v0 +; GFX9-NEXT: v_mul_lo_u32 v6, v4, v1 +; GFX9-NEXT: v_mul_lo_u32 v7, v5, v1 +; GFX9-NEXT: v_mul_hi_u32 v8, v4, v1 +; GFX9-NEXT: v_mul_hi_u32 v1, v5, v1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v7, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v2, v6, v2 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 +; GFX9-NEXT: v_add_u32_e32 v3, v7, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v3, 0 +; GFX9-NEXT: v_add3_u32 v1, v3, v2, v1 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v1, vcc +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v4, 0 +; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX9-NEXT: v_mov_b32_e32 v7, s7 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s10, v4, v[1:2] -; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s11, v3, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX9-NEXT: v_mul_lo_u32 v5, v3, v1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX9-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s10, v5, v[2:3] +; GFX9-NEXT: v_mul_hi_u32 v6, v4, v0 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s11, v4, v[1:2] +; GFX9-NEXT: v_mul_lo_u32 v2, v5, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, v5, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, v4, v1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v6, v4, v1 -; GFX9-NEXT: v_add_u32_e32 v2, v5, v2 -; GFX9-NEXT: v_mul_hi_u32 v5, v3, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1 +; GFX9-NEXT: v_mul_lo_u32 v6, v5, v1 +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_mul_hi_u32 v3, v4, v1 +; GFX9-NEXT: v_mul_hi_u32 v1, v5, v1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 +; GFX9-NEXT: v_add_u32_e32 v3, v6, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc +; GFX9-NEXT: v_add3_u32 v1, v3, v2, v1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v1, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, s9, v0 ; GFX9-NEXT: v_mul_lo_u32 v3, s8, v1 ; GFX9-NEXT: v_mul_hi_u32 v4, s8, v0 @@ -408,25 +418,27 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v5, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_add3_u32 v3, v3, v2, v6 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s6, v3, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v6, s9 +; GFX9-NEXT: v_add3_u32 v4, v3, v2, v6 +; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s8, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s6, v4, v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s7, v5, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_subb_co_u32_e64 v2, s[0:1], v6, v1, vcc +; GFX9-NEXT: v_subb_co_u32_e64 v2, s[0:1], v3, v1, vcc ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v2 ; GFX9-NEXT: v_sub_u32_e32 v1, s9, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v2 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v8, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v8, s[0:1] ; GFX9-NEXT: v_subrev_co_u32_e32 v8, vcc, s6, v0 ; GFX9-NEXT: v_subbrev_co_u32_e64 v9, s[0:1], 0, v1, vcc ; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], 1, v5 -; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v3, s[0:1] +; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v4, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v8 @@ -440,32 +452,33 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v11, s[0:1] ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v8, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v10, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v8, v7, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v10, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v0, v6, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v11, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v0, v5, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v1, s[0:1] ; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], s[4:5] -; GFX9-NEXT: v_xor_b32_e32 v0, s0, v5 -; GFX9-NEXT: v_xor_b32_e32 v1, s1, v3 +; GFX9-NEXT: v_xor_b32_e32 v0, s0, v3 +; GFX9-NEXT: v_xor_b32_e32 v1, s1, v4 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v0 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9-NEXT: v_xor_b32_e32 v3, s2, v6 -; GFX9-NEXT: v_xor_b32_e32 v5, s2, v2 -; GFX9-NEXT: v_mov_b32_e32 v6, s2 +; GFX9-NEXT: v_xor_b32_e32 v3, s2, v5 +; GFX9-NEXT: v_xor_b32_e32 v4, s2, v2 +; GFX9-NEXT: v_mov_b32_e32 v5, s2 ; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s2, v3 -; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v5, v6, vcc -; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[12:13] -; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[14:15] +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v5, vcc +; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[12:13] +; GFX9-NEXT: global_store_dwordx2 v6, v[2:3], s[14:15] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: sdivrem_i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x0 +; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_ashr_i32 s2, s17, 31 ; GFX10-NEXT: s_ashr_i32 s4, s19, 31 @@ -485,62 +498,65 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GFX10-NEXT: v_trunc_f32_e32 v2, v1 -; GFX10-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 -; GFX10-NEXT: v_cvt_u32_f32_e32 v4, v2 +; GFX10-NEXT: v_trunc_f32_e32 v4, v1 +; GFX10-NEXT: v_mul_f32_e32 v1, 0xcf800000, v4 +; GFX10-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GFX10-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v0 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s9, s8, v3, 0 -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s9, s8, v4, v[1:2] -; GFX10-NEXT: s_subb_u32 s9, 0, s7 +; GFX10-NEXT: v_cvt_u32_f32_e32 v5, v0 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s9, s8, v5, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, v1 ; GFX10-NEXT: v_mul_hi_u32 v6, v4, v0 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s9, s8, v4, v[2:3] +; GFX10-NEXT: s_subb_u32 s9, 0, s7 ; GFX10-NEXT: s_xor_b64 s[4:5], s[2:3], s[4:5] -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s10, s9, v3, v[1:2] +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s10, s9, v5, v[1:2] ; GFX10-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX10-NEXT: v_mul_hi_u32 v0, v3, v0 -; GFX10-NEXT: v_mul_lo_u32 v5, v3, v1 +; GFX10-NEXT: v_mul_hi_u32 v0, v5, v0 +; GFX10-NEXT: v_mul_lo_u32 v3, v5, v1 ; GFX10-NEXT: v_mul_lo_u32 v7, v4, v1 -; GFX10-NEXT: v_mul_hi_u32 v8, v3, v1 +; GFX10-NEXT: v_mul_hi_u32 v8, v5, v1 ; GFX10-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX10-NEXT: v_add_co_u32 v2, s10, v2, v5 -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s10 +; GFX10-NEXT: v_add_co_u32 v2, s10, v2, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s10 ; GFX10-NEXT: v_add_co_u32 v6, s10, v7, v6 ; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s10 ; GFX10-NEXT: v_add_co_u32 v0, s10, v2, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s10 ; GFX10-NEXT: v_add_co_u32 v2, s10, v6, v8 ; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s10 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v5, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v5, v7, v6 +; GFX10-NEXT: v_add_nc_u32_e32 v0, v3, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v3, v7, v6 ; GFX10-NEXT: v_add_co_u32 v0, s10, v2, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s10 -; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, v3, v0 -; GFX10-NEXT: v_add3_u32 v1, v5, v2, v1 +; GFX10-NEXT: v_add_co_u32 v5, vcc_lo, v5, v0 +; GFX10-NEXT: v_add3_u32 v1, v3, v2, v1 +; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v1, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s10, s8, v3, 0 -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s8, s8, v4, v[1:2] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s10, s8, v5, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, v1 ; GFX10-NEXT: v_mul_hi_u32 v6, v4, v0 -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s8, s9, v3, v[1:2] +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s8, s8, v4, v[2:3] +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s8, s9, v5, v[1:2] ; GFX10-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX10-NEXT: v_mul_hi_u32 v0, v3, v0 -; GFX10-NEXT: v_mul_lo_u32 v5, v3, v1 +; GFX10-NEXT: v_mul_hi_u32 v0, v5, v0 +; GFX10-NEXT: v_mul_lo_u32 v3, v5, v1 ; GFX10-NEXT: v_mul_lo_u32 v7, v4, v1 -; GFX10-NEXT: v_mul_hi_u32 v8, v3, v1 +; GFX10-NEXT: v_mul_hi_u32 v8, v5, v1 ; GFX10-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX10-NEXT: v_add_co_u32 v2, s8, v2, v5 -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s8 +; GFX10-NEXT: v_add_co_u32 v2, s8, v2, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s8 ; GFX10-NEXT: v_add_co_u32 v6, s8, v7, v6 ; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s8 ; GFX10-NEXT: v_add_co_u32 v0, s8, v2, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s8 ; GFX10-NEXT: v_add_co_u32 v2, s8, v6, v8 ; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s8 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v5, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v5, v7, v6 +; GFX10-NEXT: v_add_nc_u32_e32 v0, v3, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v3, v7, v6 ; GFX10-NEXT: v_add_co_u32 v0, s8, v2, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s8 -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v3, v0 -; GFX10-NEXT: v_add3_u32 v1, v5, v2, v1 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v5, v0 +; GFX10-NEXT: v_add3_u32 v1, v3, v2, v1 ; GFX10-NEXT: v_mul_lo_u32 v2, s1, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v4, v1, vcc_lo ; GFX10-NEXT: v_mul_hi_u32 v4, s0, v0 @@ -557,16 +573,18 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_add_nc_u32_e32 v2, v6, v2 ; GFX10-NEXT: v_add_co_u32 v0, s8, v0, v3 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s8 +; GFX10-NEXT: v_mul_hi_u32 v6, s1, v1 ; GFX10-NEXT: v_add_co_u32 v5, s8, v0, v2 -; GFX10-NEXT: v_mul_hi_u32 v2, s1, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s8 -; GFX10-NEXT: v_add_nc_u32_e32 v3, v4, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s8 +; GFX10-NEXT: v_add_nc_u32_e32 v4, v4, v3 +; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s8, s6, v5, 0 -; GFX10-NEXT: v_add3_u32 v3, v3, v6, v2 -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s8, s6, v3, v[1:2] +; GFX10-NEXT: v_add3_u32 v4, v4, v7, v6 +; GFX10-NEXT: v_mov_b32_e32 v2, v1 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s8, s6, v4, v[2:3] ; GFX10-NEXT: v_mad_u64_u32 v[1:2], s8, s7, v5, v[1:2] ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v5, 1 -; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v4, vcc_lo ; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v6, s1, v1 ; GFX10-NEXT: v_sub_co_ci_u32_e64 v1, s0, s1, v1, vcc_lo @@ -583,7 +601,7 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s7, v9 ; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, -1, s0 ; GFX10-NEXT: v_add_co_u32 v13, s0, v2, 1 -; GFX10-NEXT: v_add_co_ci_u32_e64 v14, s0, 0, v4, s0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v14, s0, 0, v3, s0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s7, v9 ; GFX10-NEXT: v_cndmask_b32_e64 v11, v12, v11, s0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s7, v1 @@ -593,11 +611,11 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v6, s0, 0, v6, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v13, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v7 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v14, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v14, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v7, v8, v10, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v2, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v4, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v4, v3, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v7, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v6, s0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 @@ -1311,63 +1329,67 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_trunc_f32_e32 v2, v1 ; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 ; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v3, 0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v4, v[1:2] -; GFX8-NEXT: v_mul_hi_u32 v5, v3, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s17, v3, v[1:2] -; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX8-NEXT: v_mul_lo_u32 v6, v3, v1 -; GFX8-NEXT: v_mul_lo_u32 v7, v4, v1 -; GFX8-NEXT: v_mul_hi_u32 v8, v3, v1 -; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1 +; GFX8-NEXT: v_cvt_u32_f32_e32 v4, v0 +; GFX8-NEXT: v_cvt_u32_f32_e32 v5, v2 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v4, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v5, v[2:3] +; GFX8-NEXT: v_mul_hi_u32 v3, v4, v0 +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s17, v4, v[1:2] +; GFX8-NEXT: v_mul_lo_u32 v2, v5, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, v5, v0 +; GFX8-NEXT: v_mul_lo_u32 v6, v4, v1 +; GFX8-NEXT: v_mul_lo_u32 v7, v5, v1 +; GFX8-NEXT: v_mul_hi_u32 v8, v4, v1 +; GFX8-NEXT: v_mul_hi_u32 v1, v5, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 ; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v7, v5 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v7, v3 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v3, 0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v4, v[1:2] -; GFX8-NEXT: v_mul_hi_u32 v6, v3, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s17, v3, v[1:2] -; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX8-NEXT: v_mul_lo_u32 v5, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v5, v1, vcc +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v4, 0 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v5, v[2:3] +; GFX8-NEXT: v_mul_hi_u32 v6, v4, v0 +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s17, v4, v[1:2] +; GFX8-NEXT: v_mul_lo_u32 v2, v5, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, v5, v0 +; GFX8-NEXT: v_mul_lo_u32 v3, v4, v1 ; GFX8-NEXT: s_xor_b64 s[16:17], s[4:5], s[6:7] ; GFX8-NEXT: s_ashr_i32 s6, s19, 31 ; GFX8-NEXT: s_mov_b32 s7, s6 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v6, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_mul_hi_u32 v5, v3, v1 +; GFX8-NEXT: v_mul_lo_u32 v6, v5, v1 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 +; GFX8-NEXT: v_mul_hi_u32 v3, v4, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v6, v5 -; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v6, v3 +; GFX8-NEXT: v_mul_hi_u32 v1, v5, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v4, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc ; GFX8-NEXT: v_mul_lo_u32 v2, s11, v0 ; GFX8-NEXT: v_mul_lo_u32 v3, s10, v1 ; GFX8-NEXT: v_mul_hi_u32 v4, s10, v0 @@ -1389,26 +1411,28 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v4, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v2 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v3, v[1:2] +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v2 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX8-NEXT: v_mov_b32_e32 v6, s11 +; GFX8-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v5, v[2:3] ; GFX8-NEXT: v_sub_u32_e32 v7, vcc, s10, v0 ; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s9, v4, v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v5, s9 +; GFX8-NEXT: v_mov_b32_e32 v3, s9 ; GFX8-NEXT: s_ashr_i32 s10, s3, 31 ; GFX8-NEXT: v_subb_u32_e64 v6, s[0:1], v6, v1, vcc ; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s11, v1 ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v6 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v7 -; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc +; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v3, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v6 ; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, s8, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[0:1] ; GFX8-NEXT: v_subbrev_u32_e64 v9, s[0:1], 0, v0, vcc ; GFX8-NEXT: v_add_u32_e64 v1, s[0:1], 1, v4 -; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1] +; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v5, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v8 @@ -1424,15 +1448,15 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: s_addc_u32 s3, s3, s10 ; GFX8-NEXT: s_xor_b64 s[2:3], s[2:3], s[10:11] ; GFX8-NEXT: v_cvt_f32_u32_e32 v14, s3 -; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc -; GFX8-NEXT: v_cvt_f32_u32_e32 v5, s2 +; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v3, vcc +; GFX8-NEXT: v_cvt_f32_u32_e32 v3, s2 ; GFX8-NEXT: v_subrev_u32_e32 v15, vcc, s8, v8 ; GFX8-NEXT: v_subbrev_u32_e32 v16, vcc, 0, v0, vcc ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f800000, v14 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v5 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v3 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v1, v12, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v1, v12, vcc ; GFX8-NEXT: s_xor_b64 s[8:9], s[0:1], s[6:7] ; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 @@ -1441,29 +1465,29 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v12, v0 ; GFX8-NEXT: s_sub_u32 s5, 0, s2 +; GFX8-NEXT: v_cvt_u32_f32_e32 v11, v11 ; GFX8-NEXT: s_subb_u32 s20, 0, s3 -; GFX8-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s5, v12, 0 ; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[0:1] -; GFX8-NEXT: v_cvt_u32_f32_e32 v5, v11 -; GFX8-NEXT: v_cndmask_b32_e64 v10, v3, v10, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v3, s[0:1] +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc +; GFX8-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[18:19], s5, v11, v[2:3] ; GFX8-NEXT: v_cndmask_b32_e32 v3, v8, v15, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v3, s[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[18:19], s5, v5, v[1:2] -; GFX8-NEXT: v_mul_lo_u32 v3, v5, v0 ; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[18:19], s20, v12, v[1:2] ; GFX8-NEXT: v_cndmask_b32_e32 v2, v9, v16, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v2, s[0:1] +; GFX8-NEXT: v_mul_lo_u32 v3, v11, v0 ; GFX8-NEXT: v_mul_lo_u32 v8, v12, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v2, s[0:1] ; GFX8-NEXT: v_mul_hi_u32 v2, v12, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v5, v0 -; GFX8-NEXT: v_xor_b32_e32 v9, s17, v10 +; GFX8-NEXT: v_mul_hi_u32 v0, v11, v0 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v3, v5, v1 +; GFX8-NEXT: v_mul_lo_u32 v3, v11, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v8, v2 ; GFX8-NEXT: v_mul_hi_u32 v8, v12, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 @@ -1471,73 +1495,78 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v8 -; GFX8-NEXT: v_mul_hi_u32 v1, v5, v1 +; GFX8-NEXT: v_mul_hi_u32 v1, v11, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v12, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v10, s[0:1] ; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v8, 0 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v5, v1, vcc -; GFX8-NEXT: v_xor_b32_e32 v1, s16, v4 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, v11, v1, vcc +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX8-NEXT: v_xor_b32_e32 v10, s16, v4 ; GFX8-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s5, v5, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v10, s17 -; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s16, v1 +; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s5, v9, v[0:1] +; GFX8-NEXT: v_xor_b32_e32 v5, s17, v5 +; GFX8-NEXT: v_mov_b32_e32 v11, s17 ; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s20, v8, v[3:4] -; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v9, v10, vcc +; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s16, v10 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v5, v11, vcc ; GFX8-NEXT: v_xor_b32_e32 v4, s4, v7 -; GFX8-NEXT: v_mul_lo_u32 v7, v5, v2 -; GFX8-NEXT: v_mul_lo_u32 v9, v8, v3 +; GFX8-NEXT: v_xor_b32_e32 v5, s4, v6 +; GFX8-NEXT: v_mul_lo_u32 v6, v9, v2 +; GFX8-NEXT: v_mul_lo_u32 v7, v8, v3 ; GFX8-NEXT: v_mul_hi_u32 v11, v8, v2 -; GFX8-NEXT: v_mul_hi_u32 v2, v5, v2 -; GFX8-NEXT: v_xor_b32_e32 v6, s4, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v9 -; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v11 +; GFX8-NEXT: v_mul_hi_u32 v2, v9, v2 +; GFX8-NEXT: v_mov_b32_e32 v10, s4 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v11, v5, v3 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v9, v7 -; GFX8-NEXT: v_mul_hi_u32 v9, v8, v3 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v11 +; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v11, v9, v3 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6 +; GFX8-NEXT: v_mul_hi_u32 v7, v8, v3 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v11, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v9 -; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v11, v9 -; GFX8-NEXT: v_mul_hi_u32 v3, v5, v3 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v9, v7 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v7 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v11, v7 +; GFX8-NEXT: v_mul_hi_u32 v3, v9, v3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 +; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v6 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v8, v2 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc -; GFX8-NEXT: v_mov_b32_e32 v10, s4 -; GFX8-NEXT: v_mul_lo_u32 v7, s9, v2 -; GFX8-NEXT: v_mul_lo_u32 v8, s8, v3 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v9, v3, vcc +; GFX8-NEXT: v_mul_lo_u32 v6, s9, v2 +; GFX8-NEXT: v_mul_lo_u32 v7, s8, v3 +; GFX8-NEXT: v_mul_hi_u32 v8, s8, v2 ; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s4, v4 -; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v6, v10, vcc -; GFX8-NEXT: v_mul_hi_u32 v6, s8, v2 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6 +; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v5, v10, vcc +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v7 +; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v7, s9, v3 +; GFX8-NEXT: v_mul_lo_u32 v8, s9, v3 ; GFX8-NEXT: v_mul_hi_u32 v2, s9, v2 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6 -; GFX8-NEXT: v_mul_hi_u32 v8, s8, v3 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v7, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v8 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6 +; GFX8-NEXT: v_mul_hi_u32 v7, s8, v3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v8, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v7 +; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v8, v7 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v2, v6 ; GFX8-NEXT: v_mul_hi_u32 v9, s9, v3 ; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v8, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v9, v[3:4] +; GFX8-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX8-NEXT: v_mov_b32_e32 v10, s9 +; GFX8-NEXT: v_mov_b32_e32 v6, v3 +; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v9, v[6:7] ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s8, v2 ; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s3, v8, v[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, s3 @@ -1622,66 +1651,70 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_trunc_f32_e32 v2, v1 ; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 ; GFX9-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v3, 0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v4, v[1:2] -; GFX9-NEXT: v_mul_hi_u32 v5, v3, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s17, v3, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX9-NEXT: v_mul_lo_u32 v6, v3, v1 -; GFX9-NEXT: v_mul_lo_u32 v7, v4, v1 -; GFX9-NEXT: v_mul_hi_u32 v8, v3, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1 +; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v2 +; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v5, v[2:3] +; GFX9-NEXT: v_mul_hi_u32 v3, v4, v0 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s17, v4, v[1:2] +; GFX9-NEXT: v_mul_lo_u32 v2, v5, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, v5, v0 +; GFX9-NEXT: v_mul_lo_u32 v6, v4, v1 +; GFX9-NEXT: v_mul_lo_u32 v7, v5, v1 +; GFX9-NEXT: v_mul_hi_u32 v8, v4, v1 +; GFX9-NEXT: v_mul_hi_u32 v1, v5, v1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v7, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v2, v6, v2 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 +; GFX9-NEXT: v_add_u32_e32 v3, v7, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v3, 0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v4, v[1:2] -; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s17, v3, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX9-NEXT: v_mul_lo_u32 v5, v3, v1 +; GFX9-NEXT: v_add3_u32 v1, v3, v2, v1 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v1, vcc +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v4, 0 +; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX9-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v5, v[2:3] +; GFX9-NEXT: v_mul_hi_u32 v6, v4, v0 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s17, v4, v[1:2] +; GFX9-NEXT: v_mul_lo_u32 v2, v5, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, v5, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, v4, v1 ; GFX9-NEXT: s_xor_b64 s[16:17], s[4:5], s[6:7] ; GFX9-NEXT: s_ashr_i32 s6, s19, 31 ; GFX9-NEXT: s_mov_b32 s7, s6 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v6, v4, v1 -; GFX9-NEXT: v_add_u32_e32 v2, v5, v2 -; GFX9-NEXT: v_mul_hi_u32 v5, v3, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1 +; GFX9-NEXT: v_mul_lo_u32 v6, v5, v1 +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_mul_hi_u32 v3, v4, v1 +; GFX9-NEXT: v_mul_hi_u32 v1, v5, v1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 +; GFX9-NEXT: v_add_u32_e32 v3, v6, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc +; GFX9-NEXT: v_add3_u32 v1, v3, v2, v1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v1, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, s11, v0 ; GFX9-NEXT: v_mul_lo_u32 v3, s10, v1 ; GFX9-NEXT: v_mul_hi_u32 v4, s10, v0 ; GFX9-NEXT: v_mul_hi_u32 v0, s11, v0 -; GFX9-NEXT: v_mul_hi_u32 v6, s11, v1 +; GFX9-NEXT: v_mul_hi_u32 v5, s11, v1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 @@ -1693,37 +1726,39 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v5, 0 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v0, v2 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v6, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_add3_u32 v3, v3, v2, v6 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v3, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v6, s11 -; GFX9-NEXT: v_sub_co_u32_e32 v7, vcc, s10, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s9, v5, v[1:2] +; GFX9-NEXT: v_add3_u32 v7, v3, v2, v5 +; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v7, v[2:3] +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s10, v0 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s9, v6, v[1:2] ; GFX9-NEXT: v_mov_b32_e32 v4, s9 ; GFX9-NEXT: s_ashr_i32 s10, s3, 31 -; GFX9-NEXT: v_subb_co_u32_e64 v6, s[0:1], v6, v1, vcc -; GFX9-NEXT: v_sub_u32_e32 v0, s11, v1 -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v7 -; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v6 -; GFX9-NEXT: v_subrev_co_u32_e32 v9, vcc, s8, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v8, v1, v2, s[0:1] -; GFX9-NEXT: v_subbrev_co_u32_e64 v10, s[0:1], 0, v0, vcc -; GFX9-NEXT: v_add_co_u32_e64 v2, s[0:1], 1, v5 -; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v3, s[0:1] +; GFX9-NEXT: v_subb_co_u32_e64 v1, s[0:1], v5, v2, vcc +; GFX9-NEXT: v_sub_u32_e32 v2, s11, v2 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v0 +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v1 +; GFX9-NEXT: v_subrev_co_u32_e32 v9, vcc, s8, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v8, v3, v5, s[0:1] +; GFX9-NEXT: v_subbrev_co_u32_e64 v10, s[0:1], 0, v2, vcc +; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], 1, v6 +; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v7, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v10 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v10 -; GFX9-NEXT: v_cndmask_b32_e64 v12, v1, v12, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v12, v3, v12, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v5 ; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v11, s[0:1] ; GFX9-NEXT: s_add_u32 s0, s18, s6 ; GFX9-NEXT: s_addc_u32 s1, s19, s6 @@ -1731,118 +1766,123 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: s_mov_b32 s11, s10 ; GFX9-NEXT: s_addc_u32 s3, s3, s10 ; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[10:11] -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s3 ; GFX9-NEXT: v_cvt_f32_u32_e32 v15, s2 -; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v4, vcc -; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 -; GFX9-NEXT: v_add_f32_e32 v1, v1, v15 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s8, v9 -; GFX9-NEXT: v_subbrev_co_u32_e32 v15, vcc, 0, v0, vcc -; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v1 -; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GFX9-NEXT: v_trunc_f32_e32 v16, v1 -; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v16 -; GFX9-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v17, v0 +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v4, vcc +; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f800000, v3 +; GFX9-NEXT: v_add_f32_e32 v3, v3, v15 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; GFX9-NEXT: v_subrev_co_u32_e32 v15, vcc, s8, v9 +; GFX9-NEXT: v_subbrev_co_u32_e32 v16, vcc, 0, v2, vcc +; GFX9-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v3 +; GFX9-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 +; GFX9-NEXT: v_trunc_f32_e32 v4, v3 +; GFX9-NEXT: v_mul_f32_e32 v3, 0xcf800000, v4 +; GFX9-NEXT: v_add_f32_e32 v2, v3, v2 +; GFX9-NEXT: v_cvt_u32_f32_e32 v17, v2 ; GFX9-NEXT: s_xor_b64 s[8:9], s[0:1], s[6:7] ; GFX9-NEXT: s_sub_u32 s5, 0, s2 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s5, v17, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v2, v13, vcc -; GFX9-NEXT: v_cvt_u32_f32_e32 v13, v16 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v17, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v5, v13, vcc +; GFX9-NEXT: v_cvt_u32_f32_e32 v13, v4 +; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX9-NEXT: s_subb_u32 s20, 0, s3 +; GFX9-NEXT: v_mov_b32_e32 v4, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v13, v[1:2] +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s5, v13, v[4:5] ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v8, v3, v11, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[18:19], s20, v17, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v13, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v10, v15, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, v17, v1 -; GFX9-NEXT: v_mul_hi_u32 v10, v17, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, v13, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[0:1] -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v10 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v10, v13, v1 -; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_mul_hi_u32 v3, v17, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v13, v1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v10, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v5, v6, v12, s[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[18:19], s20, v17, v[3:4] +; GFX9-NEXT: v_cndmask_b32_e64 v4, v7, v11, s[0:1] +; GFX9-NEXT: v_mul_lo_u32 v7, v13, v2 +; GFX9-NEXT: v_mul_lo_u32 v8, v17, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v9, v15, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v9, v10, v16, vcc +; GFX9-NEXT: v_mul_hi_u32 v10, v17, v2 +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v10 +; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v10, v13, v3 +; GFX9-NEXT: v_mul_hi_u32 v2, v13, v2 +; GFX9-NEXT: v_add_u32_e32 v7, v8, v7 +; GFX9-NEXT: v_mul_hi_u32 v8, v17, v3 +; GFX9-NEXT: v_mul_hi_u32 v3, v13, v3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v10, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v3, v10, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v17, v0 -; GFX9-NEXT: v_add3_u32 v1, v3, v2, v1 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[18:19], s5, v10, 0 -; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v13, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7 +; GFX9-NEXT: v_add_u32_e32 v8, v10, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v3, v8, v7, v3 +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v17, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v13, v3, vcc +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[18:19], s5, v7, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v6, v0, v6, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v9, v1, v9, s[0:1] +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX9-NEXT: v_xor_b32_e32 v10, s17, v4 ; GFX9-NEXT: v_mov_b32_e32 v0, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v4, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s5, v11, v[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s5, v8, v[0:1] ; GFX9-NEXT: v_xor_b32_e32 v5, s16, v5 -; GFX9-NEXT: v_xor_b32_e32 v8, s17, v8 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s20, v10, v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v9, s17 +; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s20, v7, v[0:1] ; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s16, v5 -; GFX9-NEXT: v_xor_b32_e32 v4, s4, v7 -; GFX9-NEXT: v_mul_lo_u32 v5, v11, v2 -; GFX9-NEXT: v_mul_lo_u32 v7, v10, v3 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v8, v9, vcc -; GFX9-NEXT: v_mul_hi_u32 v8, v10, v2 -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v8 +; GFX9-NEXT: v_xor_b32_e32 v4, s4, v6 +; GFX9-NEXT: v_mul_lo_u32 v5, v8, v2 +; GFX9-NEXT: v_mul_lo_u32 v6, v7, v3 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v10, v11, vcc +; GFX9-NEXT: v_mul_hi_u32 v10, v7, v2 +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v10 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v8, v11, v3 -; GFX9-NEXT: v_mul_hi_u32 v2, v11, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 -; GFX9-NEXT: v_mul_hi_u32 v7, v10, v3 -; GFX9-NEXT: v_mul_hi_u32 v3, v11, v3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v10, v8, v3 +; GFX9-NEXT: v_mul_hi_u32 v2, v8, v2 +; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 +; GFX9-NEXT: v_mul_hi_u32 v6, v7, v3 +; GFX9-NEXT: v_mul_hi_u32 v3, v8, v3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v10, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 -; GFX9-NEXT: v_add_u32_e32 v7, v8, v7 +; GFX9-NEXT: v_add_u32_e32 v6, v10, v6 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v3, v7, v5, v3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v10, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v11, v3, vcc +; GFX9-NEXT: v_add3_u32 v3, v6, v5, v3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v7, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v3, vcc ; GFX9-NEXT: v_mul_lo_u32 v5, s9, v2 -; GFX9-NEXT: v_mul_lo_u32 v7, s8, v3 -; GFX9-NEXT: v_mul_hi_u32 v9, s8, v2 +; GFX9-NEXT: v_mul_lo_u32 v6, s8, v3 +; GFX9-NEXT: v_mul_hi_u32 v8, s8, v2 ; GFX9-NEXT: v_mul_hi_u32 v2, s9, v2 ; GFX9-NEXT: v_mul_hi_u32 v12, s9, v3 -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v9 +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v9, s9, v3 -; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 -; GFX9-NEXT: v_mul_hi_u32 v7, s8, v3 -; GFX9-NEXT: v_xor_b32_e32 v6, s4, v6 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v9, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v8, s9, v3 +; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 +; GFX9-NEXT: v_mul_hi_u32 v6, s8, v3 +; GFX9-NEXT: v_xor_b32_e32 v9, s4, v9 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v2, v5 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v10, 0 -; GFX9-NEXT: v_mov_b32_e32 v8, s4 +; GFX9-NEXT: v_mov_b32_e32 v7, s4 ; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s4, v4 -; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v6, v8, vcc -; GFX9-NEXT: v_add_u32_e32 v6, v9, v7 +; GFX9-NEXT: v_add_u32_e32 v6, v8, v6 +; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v9, v7, vcc ; GFX9-NEXT: v_add3_u32 v8, v6, v11, v12 -; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v8, v[3:4] +; GFX9-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX9-NEXT: v_mov_b32_e32 v9, s9 +; GFX9-NEXT: v_mov_b32_e32 v6, v3 +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v8, v[6:7] ; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s8, v2 ; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s3, v10, v[6:7] ; GFX9-NEXT: v_mov_b32_e32 v3, s3 @@ -1937,259 +1977,268 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX10-NEXT: v_mul_f32_e32 v2, 0x2f800000, v0 -; GFX10-NEXT: v_trunc_f32_e32 v2, v2 -; GFX10-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v1 -; GFX10-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 -; GFX10-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 -; GFX10-NEXT: v_cvt_u32_f32_e32 v9, v2 +; GFX10-NEXT: v_trunc_f32_e32 v4, v2 +; GFX10-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v1 +; GFX10-NEXT: v_mul_f32_e32 v1, 0xcf800000, v4 +; GFX10-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 +; GFX10-NEXT: v_cvt_u32_f32_e32 v9, v4 ; GFX10-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX10-NEXT: v_trunc_f32_e32 v6, v4 +; GFX10-NEXT: v_trunc_f32_e32 v6, v3 ; GFX10-NEXT: v_cvt_u32_f32_e32 v7, v0 -; GFX10-NEXT: v_mul_f32_e32 v4, 0xcf800000, v6 +; GFX10-NEXT: v_mul_f32_e32 v3, 0xcf800000, v6 +; GFX10-NEXT: v_cvt_u32_f32_e32 v10, v6 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, s21, v7, 0 -; GFX10-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX10-NEXT: v_add_f32_e32 v5, v3, v2 ; GFX10-NEXT: s_sub_u32 s5, 0, s2 -; GFX10-NEXT: v_cvt_u32_f32_e32 v8, v3 -; GFX10-NEXT: v_mul_hi_u32 v10, v9, v0 -; GFX10-NEXT: v_mad_u64_u32 v[2:3], s22, s5, v8, 0 -; GFX10-NEXT: v_mad_u64_u32 v[4:5], s22, s21, v9, v[1:2] -; GFX10-NEXT: v_cvt_u32_f32_e32 v5, v6 -; GFX10-NEXT: v_mov_b32_e32 v1, v3 -; GFX10-NEXT: v_mul_hi_u32 v6, v7, v0 +; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX10-NEXT: v_cvt_u32_f32_e32 v8, v5 +; GFX10-NEXT: v_mov_b32_e32 v2, v1 +; GFX10-NEXT: v_mul_hi_u32 v11, v7, v0 +; GFX10-NEXT: v_mul_hi_u32 v12, v9, v0 +; GFX10-NEXT: v_mad_u64_u32 v[4:5], s22, s5, v8, 0 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s22, s21, v9, v[2:3] +; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX10-NEXT: s_subb_u32 s22, 0, s3 -; GFX10-NEXT: v_mul_hi_u32 v12, v8, v2 -; GFX10-NEXT: v_mul_lo_u32 v11, v5, v2 -; GFX10-NEXT: v_mad_u64_u32 v[3:4], s23, s20, v7, v[4:5] -; GFX10-NEXT: v_mul_lo_u32 v4, v9, v0 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s23, s5, v5, v[1:2] -; GFX10-NEXT: v_mul_hi_u32 v2, v5, v2 -; GFX10-NEXT: v_mul_lo_u32 v13, v7, v3 -; GFX10-NEXT: v_mul_lo_u32 v14, v9, v3 -; GFX10-NEXT: v_mul_hi_u32 v15, v7, v3 +; GFX10-NEXT: v_mov_b32_e32 v2, v5 +; GFX10-NEXT: v_mad_u64_u32 v[5:6], s23, s20, v7, v[1:2] +; GFX10-NEXT: v_mul_lo_u32 v6, v9, v0 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s23, s5, v10, v[2:3] +; GFX10-NEXT: v_mul_lo_u32 v2, v10, v4 +; GFX10-NEXT: v_mul_hi_u32 v3, v8, v4 +; GFX10-NEXT: v_mul_hi_u32 v4, v10, v4 +; GFX10-NEXT: v_mul_lo_u32 v13, v7, v5 +; GFX10-NEXT: v_mul_lo_u32 v14, v9, v5 +; GFX10-NEXT: v_mul_hi_u32 v15, v7, v5 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s23, s22, v8, v[0:1] -; GFX10-NEXT: v_mul_hi_u32 v1, v9, v3 -; GFX10-NEXT: v_add_co_u32 v3, s23, v4, v13 -; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s23 -; GFX10-NEXT: v_add_co_u32 v10, s23, v14, v10 +; GFX10-NEXT: v_mul_hi_u32 v1, v9, v5 +; GFX10-NEXT: v_add_co_u32 v5, s23, v6, v13 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s23 +; GFX10-NEXT: v_add_co_u32 v12, s23, v14, v12 ; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s23 ; GFX10-NEXT: v_mul_lo_u32 v14, v8, v0 -; GFX10-NEXT: v_add_co_u32 v3, s23, v3, v6 -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s23 -; GFX10-NEXT: v_add_co_u32 v6, s23, v10, v15 -; GFX10-NEXT: v_mul_lo_u32 v15, v5, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s23 +; GFX10-NEXT: v_add_co_u32 v5, s23, v5, v11 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s23 +; GFX10-NEXT: v_add_co_u32 v11, s23, v12, v15 +; GFX10-NEXT: v_mul_lo_u32 v15, v10, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s23 +; GFX10-NEXT: v_add_nc_u32_e32 v5, v6, v5 +; GFX10-NEXT: v_add_co_u32 v2, s23, v2, v14 ; GFX10-NEXT: v_mul_hi_u32 v16, v8, v0 -; GFX10-NEXT: v_mul_hi_u32 v17, v5, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v4, v3 -; GFX10-NEXT: v_add_co_u32 v4, s23, v11, v14 -; GFX10-NEXT: v_add_nc_u32_e32 v3, v13, v10 -; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s23 -; GFX10-NEXT: v_add_co_u32 v2, s23, v15, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s23 -; GFX10-NEXT: v_add_co_u32 v0, s23, v6, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s23 -; GFX10-NEXT: v_add_co_u32 v4, s23, v4, v12 -; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s23 -; GFX10-NEXT: v_add_co_u32 v2, s23, v2, v16 -; GFX10-NEXT: v_add3_u32 v1, v3, v6, v1 -; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v7, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v3, v10, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v6, v13, v12 ; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s23 -; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v9, v1, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s23, s21, v6, 0 +; GFX10-NEXT: v_add_co_u32 v4, s23, v15, v4 +; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s23 +; GFX10-NEXT: v_add_co_u32 v5, s23, v11, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s23 ; GFX10-NEXT: v_add_co_u32 v2, s23, v2, v3 -; GFX10-NEXT: v_add_nc_u32_e32 v4, v11, v12 -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s23 -; GFX10-NEXT: v_mov_b32_e32 v10, 0 -; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v8, v2 -; GFX10-NEXT: v_mul_hi_u32 v11, v7, v0 -; GFX10-NEXT: v_add3_u32 v3, v4, v3, v17 -; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v5, v3, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[2:3], s23, s5, v8, 0 -; GFX10-NEXT: v_mad_u64_u32 v[4:5], s21, s21, v7, v[1:2] -; GFX10-NEXT: v_mov_b32_e32 v1, v3 -; GFX10-NEXT: v_mul_lo_u32 v12, v9, v2 -; GFX10-NEXT: v_mul_hi_u32 v13, v8, v2 -; GFX10-NEXT: v_mad_u64_u32 v[3:4], s20, s20, v6, v[4:5] -; GFX10-NEXT: v_mul_lo_u32 v4, v7, v0 -; GFX10-NEXT: v_mul_hi_u32 v5, v6, v0 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, s5, v9, v[1:2] -; GFX10-NEXT: v_mul_hi_u32 v2, v9, v2 -; GFX10-NEXT: v_mul_lo_u32 v14, v6, v3 -; GFX10-NEXT: v_mul_lo_u32 v15, v7, v3 -; GFX10-NEXT: v_mul_hi_u32 v16, v6, v3 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, s22, v8, v[0:1] -; GFX10-NEXT: v_mul_hi_u32 v1, v7, v3 -; GFX10-NEXT: v_add_co_u32 v3, s5, v4, v14 -; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v11, s5, v15, v11 -; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v3, s5, v3, v5 -; GFX10-NEXT: v_mul_lo_u32 v15, v8, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s23 +; GFX10-NEXT: v_add_co_u32 v3, s23, v4, v16 +; GFX10-NEXT: v_add3_u32 v1, v6, v11, v1 +; GFX10-NEXT: v_add_co_u32 v11, vcc_lo, v7, v5 +; GFX10-NEXT: v_add_nc_u32_e32 v5, v12, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s23 +; GFX10-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, v9, v1, vcc_lo +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s23, s21, v11, 0 +; GFX10-NEXT: v_add_co_u32 v5, s23, v3, v5 +; GFX10-NEXT: v_mul_hi_u32 v0, v10, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v6, v13, v4 +; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s23 +; GFX10-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX10-NEXT: v_add_co_u32 v13, vcc_lo, v8, v5 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_add3_u32 v0, v6, v7, v0 +; GFX10-NEXT: v_mad_u64_u32 v[5:6], s23, s5, v13, 0 +; GFX10-NEXT: v_mad_u64_u32 v[2:3], s21, s21, v12, v[3:4] +; GFX10-NEXT: ; implicit-def: $vgpr7_vgpr8 +; GFX10-NEXT: v_add_co_ci_u32_e32 v14, vcc_lo, v10, v0, vcc_lo +; GFX10-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX10-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mul_hi_u32 v9, v11, v1 +; GFX10-NEXT: v_mov_b32_e32 v7, v6 +; GFX10-NEXT: v_mul_hi_u32 v16, v14, v5 +; GFX10-NEXT: v_mad_u64_u32 v[2:3], s20, s20, v11, v[2:3] +; GFX10-NEXT: v_mul_lo_u32 v3, v12, v1 +; GFX10-NEXT: v_mad_u64_u32 v[6:7], s5, s5, v14, v[7:8] +; GFX10-NEXT: v_mul_lo_u32 v7, v14, v5 +; GFX10-NEXT: v_mul_hi_u32 v1, v12, v1 +; GFX10-NEXT: v_mul_hi_u32 v8, v13, v5 +; GFX10-NEXT: v_mul_lo_u32 v15, v11, v2 +; GFX10-NEXT: v_mul_lo_u32 v17, v12, v2 +; GFX10-NEXT: v_mul_hi_u32 v18, v11, v2 +; GFX10-NEXT: v_mul_hi_u32 v2, v12, v2 +; GFX10-NEXT: v_mad_u64_u32 v[5:6], s5, s22, v13, v[6:7] +; GFX10-NEXT: v_add_co_u32 v3, s5, v3, v15 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s5 +; GFX10-NEXT: v_add_co_u32 v1, s5, v17, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v15, 0, 1, s5 +; GFX10-NEXT: v_mul_lo_u32 v17, v13, v5 +; GFX10-NEXT: v_add_co_u32 v3, s5, v3, v9 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s5 +; GFX10-NEXT: v_add_co_u32 v1, s5, v1, v18 +; GFX10-NEXT: v_mul_lo_u32 v18, v14, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s5 +; GFX10-NEXT: v_add_nc_u32_e32 v3, v6, v3 +; GFX10-NEXT: v_add_co_u32 v7, s5, v7, v17 +; GFX10-NEXT: v_mul_hi_u32 v19, v13, v5 +; GFX10-NEXT: v_add_nc_u32_e32 v6, v15, v9 +; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s5 +; GFX10-NEXT: v_add_co_u32 v15, s5, v18, v16 +; GFX10-NEXT: v_cndmask_b32_e64 v16, 0, 1, s5 +; GFX10-NEXT: v_add_co_u32 v1, s5, v1, v3 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v5, s5, v11, v16 -; GFX10-NEXT: v_mul_lo_u32 v16, v9, v0 +; GFX10-NEXT: v_add_co_u32 v7, s5, v7, v8 +; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s5 +; GFX10-NEXT: v_add_co_u32 v8, s5, v15, v19 +; GFX10-NEXT: v_add3_u32 v2, v6, v3, v2 +; GFX10-NEXT: v_add_co_u32 v1, vcc_lo, v11, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v3, v9, v7 +; GFX10-NEXT: v_cndmask_b32_e64 v15, 0, 1, s5 +; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v12, v2, vcc_lo +; GFX10-NEXT: v_mul_hi_u32 v5, v14, v5 +; GFX10-NEXT: v_add_co_u32 v3, s5, v8, v3 +; GFX10-NEXT: v_add_nc_u32_e32 v6, v16, v15 +; GFX10-NEXT: v_mul_lo_u32 v7, s1, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s5 +; GFX10-NEXT: v_mul_lo_u32 v11, s0, v2 +; GFX10-NEXT: v_mul_hi_u32 v9, s0, v1 +; GFX10-NEXT: v_mul_hi_u32 v1, s1, v1 +; GFX10-NEXT: v_mul_lo_u32 v12, s1, v2 +; GFX10-NEXT: v_add3_u32 v5, v6, v8, v5 +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, v13, v3 +; GFX10-NEXT: v_mul_hi_u32 v6, s0, v2 +; GFX10-NEXT: v_add_co_u32 v7, s5, v7, v11 +; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v14, v5, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s5 -; GFX10-NEXT: v_add_nc_u32_e32 v3, v4, v3 -; GFX10-NEXT: v_mul_hi_u32 v17, v8, v0 -; GFX10-NEXT: v_mul_hi_u32 v0, v9, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v4, v14, v11 -; GFX10-NEXT: v_add_co_u32 v11, s5, v12, v15 +; GFX10-NEXT: v_add_co_u32 v1, s5, v12, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v2, s5, v16, v2 +; GFX10-NEXT: v_add_co_u32 v7, s5, v7, v9 +; GFX10-NEXT: v_mul_hi_u32 v8, s1, v2 +; GFX10-NEXT: v_mul_lo_u32 v2, s19, v3 +; GFX10-NEXT: v_mul_lo_u32 v14, s18, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s5 +; GFX10-NEXT: v_add_co_u32 v1, s5, v1, v6 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s5 +; GFX10-NEXT: v_mul_hi_u32 v13, s18, v3 +; GFX10-NEXT: v_add_nc_u32_e32 v7, v11, v7 +; GFX10-NEXT: v_mul_hi_u32 v3, s19, v3 +; GFX10-NEXT: v_add_co_u32 v11, s5, v2, v14 +; GFX10-NEXT: v_mul_lo_u32 v9, s19, v5 +; GFX10-NEXT: v_add_nc_u32_e32 v6, v12, v6 +; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s5 +; GFX10-NEXT: v_add_co_u32 v7, s5, v1, v7 +; GFX10-NEXT: v_mul_hi_u32 v15, s18, v5 ; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v3, s5, v5, v3 -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s5 +; GFX10-NEXT: v_mul_hi_u32 v5, s19, v5 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s5, s6, v7, 0 +; GFX10-NEXT: v_add_co_u32 v3, s20, v9, v3 ; GFX10-NEXT: v_add_co_u32 v11, s5, v11, v13 ; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v2, s5, v2, v17 -; GFX10-NEXT: v_add3_u32 v1, v4, v5, v1 -; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, v6, v3 -; GFX10-NEXT: v_add_nc_u32_e32 v4, v12, v11 -; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s5 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v7, v1, vcc_lo -; GFX10-NEXT: v_mul_lo_u32 v6, s1, v3 -; GFX10-NEXT: v_add_co_u32 v2, s5, v2, v4 -; GFX10-NEXT: v_add_nc_u32_e32 v5, v14, v13 -; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s5 -; GFX10-NEXT: v_mul_lo_u32 v11, s0, v1 -; GFX10-NEXT: v_mul_hi_u32 v7, s0, v3 -; GFX10-NEXT: v_mul_hi_u32 v3, s1, v3 -; GFX10-NEXT: v_mul_lo_u32 v12, s1, v1 -; GFX10-NEXT: v_add3_u32 v0, v5, v4, v0 -; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v8, v2 -; GFX10-NEXT: v_mul_hi_u32 v4, s0, v1 -; GFX10-NEXT: v_mul_hi_u32 v5, s1, v1 -; GFX10-NEXT: v_add_co_u32 v1, s5, v6, v11 -; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, v9, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v3, s5, v12, v3 -; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v1, s5, v1, v7 -; GFX10-NEXT: v_mul_lo_u32 v0, s19, v2 -; GFX10-NEXT: v_mul_lo_u32 v12, s18, v8 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v3, s5, v3, v4 -; GFX10-NEXT: v_mul_hi_u32 v9, s18, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s5 -; GFX10-NEXT: v_mul_hi_u32 v2, s19, v2 -; GFX10-NEXT: v_mul_lo_u32 v7, s19, v8 -; GFX10-NEXT: v_add_nc_u32_e32 v1, v6, v1 -; GFX10-NEXT: v_add_co_u32 v6, s5, v0, v12 -; GFX10-NEXT: v_mul_hi_u32 v13, s18, v8 -; GFX10-NEXT: v_add_nc_u32_e32 v4, v11, v4 -; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v12, s5, v3, v1 -; GFX10-NEXT: v_add_co_u32 v2, s20, v7, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s5 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, s6, v12, 0 -; GFX10-NEXT: v_add_co_u32 v6, s5, v6, v9 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v9, s5, v2, v13 -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s20 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s5 -; GFX10-NEXT: v_add3_u32 v4, v4, v7, v5 -; GFX10-NEXT: v_add_nc_u32_e32 v6, v11, v6 -; GFX10-NEXT: v_mul_hi_u32 v5, s19, v8 -; GFX10-NEXT: v_add_co_u32 v7, vcc_lo, v12, 1 -; GFX10-NEXT: v_add_nc_u32_e32 v3, v3, v2 -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s5, s6, v4, v[1:2] -; GFX10-NEXT: v_add_co_u32 v6, s5, v9, v6 -; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s5 -; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v4, vcc_lo +; GFX10-NEXT: v_add_co_u32 v13, s5, v3, v15 +; GFX10-NEXT: v_add3_u32 v6, v6, v14, v8 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v15, 0, 1, s5 +; GFX10-NEXT: v_add_nc_u32_e32 v8, v12, v11 +; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s20 ; GFX10-NEXT: v_add_co_u32 v11, vcc_lo, v7, 1 -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s5, s7, v12, v[1:2] -; GFX10-NEXT: v_add3_u32 v5, v3, v9, v5 -; GFX10-NEXT: v_mad_u64_u32 v[2:3], s5, s2, v6, 0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, 0, v8, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v14, vcc_lo, s0, v0 -; GFX10-NEXT: v_sub_nc_u32_e32 v9, s1, v1 -; GFX10-NEXT: v_sub_co_ci_u32_e64 v15, s0, s1, v1, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v0, v3 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v9, vcc_lo, s7, v9, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v3, vcc_lo, v14, s6 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v16, s0, 0, v9, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s6, v14 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v9, vcc_lo, s7, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v17, 0, -1, s0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s6, v3 -; GFX10-NEXT: v_cndmask_b32_e64 v18, 0, -1, s0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s7, v16 +; GFX10-NEXT: v_mad_u64_u32 v[2:3], s5, s6, v6, v[3:4] +; GFX10-NEXT: v_add_co_u32 v8, s5, v13, v8 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s5 +; GFX10-NEXT: v_add_nc_u32_e32 v9, v9, v15 +; GFX10-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, 0, v6, vcc_lo +; GFX10-NEXT: v_mad_u64_u32 v[2:3], s5, s7, v7, v[2:3] +; GFX10-NEXT: v_add_co_u32 v13, vcc_lo, v11, 1 +; GFX10-NEXT: v_add3_u32 v5, v9, v4, v5 +; GFX10-NEXT: v_mad_u64_u32 v[3:4], s5, s2, v8, 0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v14, vcc_lo, 0, v12, vcc_lo +; GFX10-NEXT: v_sub_nc_u32_e32 v9, s1, v2 +; GFX10-NEXT: v_sub_co_u32 v15, vcc_lo, s0, v1 +; GFX10-NEXT: v_sub_co_ci_u32_e64 v16, s0, s1, v2, vcc_lo +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v17, vcc_lo, s7, v9, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v9, v4 +; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v15, s6 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v18, s0, 0, v17, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s6, v15 ; GFX10-NEXT: v_cndmask_b32_e64 v19, 0, -1, s0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s7, v15 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s6, v4 ; GFX10-NEXT: v_cndmask_b32_e64 v20, 0, -1, s0 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s2, v5, v[0:1] +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s7, v18 +; GFX10-NEXT: v_cndmask_b32_e64 v21, 0, -1, s0 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s7, v16 +; GFX10-NEXT: v_cndmask_b32_e64 v22, 0, -1, s0 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, s2, v5, v[9:10] +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s7, v18 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v10, vcc_lo, s7, v17, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v21, v20, s0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s7, v16 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v19, v18, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s7, v15 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v17, v20, v17, s0 -; GFX10-NEXT: v_sub_co_u32 v1, s0, v3, s6 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v9, s0, 0, v9, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s1, s3, v6, v[0:1] -; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v17 -; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v13, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v1, v12, v7, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v7, v16, v9, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, s18, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v8, s0 -; GFX10-NEXT: v_sub_co_ci_u32_e64 v8, s1, s19, v0, vcc_lo -; GFX10-NEXT: v_sub_nc_u32_e32 v0, s19, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v14, v3, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v15, v7, s0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s3, v8 -; GFX10-NEXT: v_xor_b32_e32 v1, s16, v1 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v11, vcc_lo, s3, v0, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v2 -; GFX10-NEXT: v_xor_b32_e32 v4, s17, v4 -; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, -1, s0 -; GFX10-NEXT: v_xor_b32_e32 v3, s4, v3 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v9, v22, v19, s0 +; GFX10-NEXT: v_sub_co_u32 v2, s0, v4, s6 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v10, s0, 0, v10, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v11, v11, v13, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc_lo +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s1, s3, v8, v[1:2] +; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v9 +; GFX10-NEXT: v_cndmask_b32_e32 v9, v12, v14, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v11, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v7, v18, v10, vcc_lo +; GFX10-NEXT: v_sub_co_u32 v3, vcc_lo, s18, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v9, s0 +; GFX10-NEXT: v_sub_co_ci_u32_e64 v9, s1, s19, v1, vcc_lo +; GFX10-NEXT: v_sub_nc_u32_e32 v1, s19, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v15, v4, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v16, v7, s0 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s3, v9 +; GFX10-NEXT: v_xor_b32_e32 v2, s16, v2 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v11, vcc_lo, s3, v1, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v3 +; GFX10-NEXT: v_xor_b32_e32 v6, s17, v6 +; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, -1, s0 +; GFX10-NEXT: v_xor_b32_e32 v4, s4, v4 ; GFX10-NEXT: v_xor_b32_e32 v7, s4, v7 ; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v13, vcc_lo, v2, s2 +; GFX10-NEXT: v_sub_co_u32 v13, vcc_lo, v3, s2 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v14, s0, 0, v11, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v0, s0, v1, s16 -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v1, s0, s17, v4, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s3, v8 +; GFX10-NEXT: v_sub_co_u32 v1, s0, v2, s16 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v2, s0, s17, v6, s0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s3, v9 ; GFX10-NEXT: v_subrev_co_ci_u32_e32 v11, vcc_lo, s3, v11, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v4, v9, v12, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v10, v12, s0 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s3, v14 -; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, -1, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, -1, s0 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s2, v13 ; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, -1, s0 -; GFX10-NEXT: v_add_co_u32 v15, s0, v6, 1 +; GFX10-NEXT: v_add_co_u32 v15, s0, v8, 1 ; GFX10-NEXT: v_add_co_ci_u32_e64 v16, s0, 0, v5, s0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s3, v14 -; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v12, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v12, s0 ; GFX10-NEXT: v_add_co_u32 v12, s0, v15, 1 ; GFX10-NEXT: v_add_co_ci_u32_e64 v17, s0, 0, v16, s0 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v9 -; GFX10-NEXT: v_sub_co_u32 v9, s0, v13, s2 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 +; GFX10-NEXT: v_sub_co_u32 v10, s0, v13, s2 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v11, s0, 0, v11, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v12, v15, v12, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v4 +; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v6 ; GFX10-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v4, v13, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v9, v14, v11, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v12, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v13, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v10, v14, v11, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v12, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v11, v5, v15, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v4, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v9, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v6, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v10, s0 ; GFX10-NEXT: s_xor_b64 s[0:1], s[8:9], s[10:11] -; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v3, s4 -; GFX10-NEXT: v_xor_b32_e32 v3, s0, v6 -; GFX10-NEXT: v_xor_b32_e32 v6, s1, v11 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v5, vcc_lo, s4, v7, vcc_lo -; GFX10-NEXT: v_xor_b32_e32 v7, s8, v2 -; GFX10-NEXT: v_xor_b32_e32 v8, s8, v8 -; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v3, s0 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v6, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v6, vcc_lo, v7, s8 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, s8, v8, vcc_lo -; GFX10-NEXT: global_store_dwordx4 v10, v[0:3], s[12:13] -; GFX10-NEXT: global_store_dwordx4 v10, v[4:7], s[14:15] +; GFX10-NEXT: v_sub_co_u32 v5, vcc_lo, v4, s4 +; GFX10-NEXT: v_xor_b32_e32 v4, s0, v8 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v6, vcc_lo, s4, v7, vcc_lo +; GFX10-NEXT: v_xor_b32_e32 v7, s1, v11 +; GFX10-NEXT: v_xor_b32_e32 v8, s8, v3 +; GFX10-NEXT: v_xor_b32_e32 v9, s8, v9 +; GFX10-NEXT: v_sub_co_u32 v3, vcc_lo, v4, s0 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v4, vcc_lo, s1, v7, vcc_lo +; GFX10-NEXT: v_sub_co_u32 v7, vcc_lo, v8, s8 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v8, vcc_lo, s8, v9, vcc_lo +; GFX10-NEXT: global_store_dwordx4 v0, v[1:4], s[12:13] +; GFX10-NEXT: global_store_dwordx4 v0, v[5:8], s[14:15] ; GFX10-NEXT: s_endpgm %div = sdiv <2 x i64> %x, %y store <2 x i64> %div, ptr addrspace(1) %out0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll index 19dc20c510041..e77a514a06857 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll @@ -41,8 +41,10 @@ define i64 @v_srem_i64(i64 %num, i64 %den) { ; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v6 ; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2 ; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v6 +; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7 ; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[3:4] +; CHECK-NEXT: v_mov_b32_e32 v6, v3 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[6:7] ; CHECK-NEXT: v_mul_lo_u32 v3, v11, v2 ; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7] ; CHECK-NEXT: v_mul_hi_u32 v7, v8, v2 @@ -68,61 +70,65 @@ define i64 @v_srem_i64(i64 %num, i64 %den) { ; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2 ; CHECK-NEXT: v_addc_u32_e32 v11, vcc, v11, v3, vcc ; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[3:4] +; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7 +; CHECK-NEXT: v_mov_b32_e32 v6, v3 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[6:7] ; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v5 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v9 ; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7] ; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v9, vcc -; CHECK-NEXT: v_xor_b32_e32 v5, v3, v9 +; CHECK-NEXT: v_xor_b32_e32 v7, v3, v9 ; CHECK-NEXT: v_mul_lo_u32 v3, v11, v2 -; CHECK-NEXT: v_mul_lo_u32 v7, v8, v6 +; CHECK-NEXT: v_mul_lo_u32 v5, v8, v6 ; CHECK-NEXT: v_xor_b32_e32 v10, v4, v9 ; CHECK-NEXT: v_mul_hi_u32 v4, v8, v2 ; CHECK-NEXT: v_mul_hi_u32 v2, v11, v2 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_mul_lo_u32 v4, v11, v6 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; CHECK-NEXT: v_mul_hi_u32 v7, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CHECK-NEXT: v_mul_hi_u32 v5, v8, v6 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CHECK-NEXT: v_mul_hi_u32 v5, v11, v6 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v8, v2 ; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v11, v3, vcc ; CHECK-NEXT: v_mul_lo_u32 v4, v10, v2 -; CHECK-NEXT: v_mul_lo_u32 v6, v5, v3 -; CHECK-NEXT: v_mul_hi_u32 v7, v5, v2 +; CHECK-NEXT: v_mul_lo_u32 v5, v7, v3 +; CHECK-NEXT: v_mul_hi_u32 v6, v7, v2 ; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 ; CHECK-NEXT: v_mul_hi_u32 v8, v10, v3 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v7, v10, v3 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CHECK-NEXT: v_mul_hi_u32 v6, v5, v3 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; CHECK-NEXT: v_mul_lo_u32 v6, v10, v3 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; CHECK-NEXT: v_mul_hi_u32 v5, v7, v3 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v2, v4 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v7, 0 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v2, v4 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v6, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v0, v4, v[3:4] -; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v7, v[3:4] +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v4 +; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 +; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v7, v2 +; CHECK-NEXT: v_mov_b32_e32 v4, v3 +; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v0, v8, v[4:5] +; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v6, v[3:4] ; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v10, v3, vcc ; CHECK-NEXT: v_sub_i32_e64 v3, s[4:5], v10, v3 ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1 @@ -214,60 +220,64 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; CHECK-NEXT: v_trunc_f32_e32 v2, v1 ; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v0 -; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v2 -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v3, 0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v4, v[1:2] -; CHECK-NEXT: v_mul_hi_u32 v5, v3, v0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v3, v[1:2] -; CHECK-NEXT: v_mul_lo_u32 v2, v4, v0 -; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0 -; CHECK-NEXT: v_mul_lo_u32 v6, v3, v1 -; CHECK-NEXT: v_mul_lo_u32 v7, v4, v1 -; CHECK-NEXT: v_mul_hi_u32 v8, v3, v1 -; CHECK-NEXT: v_mul_hi_u32 v1, v4, v1 +; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v0 +; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v2 +; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v4, 0 +; CHECK-NEXT: v_mov_b32_e32 v2, v1 +; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v5, v[2:3] +; CHECK-NEXT: v_mul_hi_u32 v3, v4, v0 +; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v4, v[1:2] +; CHECK-NEXT: v_mul_lo_u32 v2, v5, v0 +; CHECK-NEXT: v_mul_hi_u32 v0, v5, v0 +; CHECK-NEXT: v_mul_lo_u32 v6, v4, v1 +; CHECK-NEXT: v_mul_lo_u32 v7, v5, v1 +; CHECK-NEXT: v_mul_hi_u32 v8, v4, v1 +; CHECK-NEXT: v_mul_hi_u32 v1, v5, v1 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v0 -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v3, 0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v4, v[1:2] -; CHECK-NEXT: v_mul_hi_u32 v6, v3, v0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v3, v[1:2] -; CHECK-NEXT: v_mul_lo_u32 v2, v4, v0 -; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0 -; CHECK-NEXT: v_mul_lo_u32 v5, v3, v1 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v0 +; CHECK-NEXT: v_addc_u32_e32 v5, vcc, v5, v1, vcc +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v4, 0 +; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3 +; CHECK-NEXT: v_mov_b32_e32 v2, v1 +; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v5, v[2:3] +; CHECK-NEXT: v_mul_hi_u32 v6, v4, v0 +; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v4, v[1:2] +; CHECK-NEXT: v_mul_lo_u32 v2, v5, v0 +; CHECK-NEXT: v_mul_hi_u32 v0, v5, v0 +; CHECK-NEXT: v_mul_lo_u32 v3, v4, v1 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v6, v4, v1 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_mul_hi_u32 v5, v3, v1 +; CHECK-NEXT: v_mul_lo_u32 v6, v5, v1 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; CHECK-NEXT: v_mul_hi_u32 v3, v4, v1 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v6, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_mul_hi_u32 v1, v4, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; CHECK-NEXT: v_mul_hi_u32 v1, v5, v1 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc ; CHECK-NEXT: v_mul_lo_u32 v2, s11, v0 ; CHECK-NEXT: v_mul_lo_u32 v3, s10, v1 ; CHECK-NEXT: v_mul_hi_u32 v4, s10, v0 @@ -289,12 +299,14 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v4, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v2, v[1:2] -; CHECK-NEXT: v_mov_b32_e32 v5, s11 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v2 +; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, s10, v0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s9, v4, v[1:2] +; CHECK-NEXT: v_mov_b32_e32 v2, v1 +; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v5, v[2:3] +; CHECK-NEXT: v_mov_b32_e32 v5, s11 ; CHECK-NEXT: v_mov_b32_e32 v3, s9 +; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s9, v4, v[1:2] ; CHECK-NEXT: v_subb_u32_e64 v2, s[0:1], v5, v1, vcc ; CHECK-NEXT: v_sub_i32_e64 v1, s[0:1], s11, v1 ; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc @@ -380,47 +392,49 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v4 ; GISEL-NEXT: v_trunc_f32_e32 v11, v9 ; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v11 -; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v11 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v12, 0 -; GISEL-NEXT: v_mov_b32_e32 v4, v10 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v15, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v4, v15, v9 -; GISEL-NEXT: v_mul_hi_u32 v16, v12, v9 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[10:11] +; GISEL-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v4, 0 +; GISEL-NEXT: v_mov_b32_e32 v11, v10 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v15, v[11:12] +; GISEL-NEXT: v_mul_hi_u32 v16, v4, v9 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v4, v[10:11] +; GISEL-NEXT: v_mul_lo_u32 v11, v15, v9 ; GISEL-NEXT: v_mul_hi_u32 v9, v15, v9 -; GISEL-NEXT: v_mul_lo_u32 v11, v12, v10 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v11 +; GISEL-NEXT: v_mul_lo_u32 v12, v4, v10 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v16 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GISEL-NEXT: v_mul_lo_u32 v16, v15, v10 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v11, v4 -; GISEL-NEXT: v_mul_hi_u32 v11, v12, v10 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; GISEL-NEXT: v_mul_hi_u32 v12, v4, v10 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v16, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v16, v12 +; GISEL-NEXT: v_mul_hi_u32 v10, v15, v10 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v16, v11 -; GISEL-NEXT: v_mul_hi_u32 v10, v15, v10 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v9, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v4 -; GISEL-NEXT: v_addc_u32_e32 v15, vcc, v15, v9, vcc -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v12, 0 -; GISEL-NEXT: v_mov_b32_e32 v4, v10 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v15, v[4:5] +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v4, v9 +; GISEL-NEXT: v_addc_u32_e32 v15, vcc, v15, v10, vcc +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v16, 0 +; GISEL-NEXT: ; implicit-def: $vgpr11_vgpr12 ; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[10:11] +; GISEL-NEXT: v_mov_b32_e32 v11, v10 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v15, v[11:12] ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc -; GISEL-NEXT: v_xor_b32_e32 v13, v0, v4 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v16, v[10:11] +; GISEL-NEXT: v_xor_b32_e32 v12, v0, v4 ; GISEL-NEXT: v_mul_lo_u32 v0, v15, v9 -; GISEL-NEXT: v_mul_lo_u32 v11, v12, v10 -; GISEL-NEXT: v_xor_b32_e32 v14, v1, v4 -; GISEL-NEXT: v_mul_hi_u32 v1, v12, v9 +; GISEL-NEXT: v_mul_lo_u32 v11, v16, v10 +; GISEL-NEXT: v_xor_b32_e32 v13, v1, v4 +; GISEL-NEXT: v_mul_hi_u32 v1, v16, v9 ; GISEL-NEXT: v_mul_hi_u32 v9, v15, v9 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc @@ -428,7 +442,7 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_mul_lo_u32 v1, v15, v10 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 -; GISEL-NEXT: v_mul_hi_u32 v11, v12, v10 +; GISEL-NEXT: v_mul_hi_u32 v11, v16, v10 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 @@ -439,194 +453,200 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v15, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, v14, v0 -; GISEL-NEXT: v_mul_lo_u32 v10, v13, v1 -; GISEL-NEXT: v_mul_hi_u32 v11, v13, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 +; GISEL-NEXT: v_mul_lo_u32 v9, v13, v0 +; GISEL-NEXT: v_mul_lo_u32 v10, v12, v1 +; GISEL-NEXT: v_mul_hi_u32 v11, v12, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v14, v1 +; GISEL-NEXT: v_mul_lo_u32 v11, v13, v1 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; GISEL-NEXT: v_mul_hi_u32 v10, v13, v1 +; GISEL-NEXT: v_mul_hi_u32 v10, v12, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_mul_hi_u32 v1, v14, v1 +; GISEL-NEXT: v_mul_hi_u32 v1, v13, v1 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v1, v0 -; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v7 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v10, vcc -; GISEL-NEXT: v_xor_b32_e32 v6, v6, v10 -; GISEL-NEXT: v_xor_b32_e32 v7, v7, v10 -; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v15, v7 +; GISEL-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v1, v0 +; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v7 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v9, vcc +; GISEL-NEXT: v_xor_b32_e32 v6, v6, v9 +; GISEL-NEXT: v_xor_b32_e32 v7, v7, v9 ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v11, 0 -; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 0, v6 -; GISEL-NEXT: v_mac_f32_e32 v12, 0x4f800000, v15 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v9, v[1:2] -; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v12 -; GISEL-NEXT: v_subb_u32_e32 v17, vcc, 0, v7, vcc +; GISEL-NEXT: v_cvt_f32_u32_e32 v15, v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v16, v7 +; GISEL-NEXT: v_mov_b32_e32 v9, v1 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v14, v[9:10] +; GISEL-NEXT: v_mac_f32_e32 v15, 0x4f800000, v16 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v15 ; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v8, v11, v[9:10] +; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 0, v6 ; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 ; GISEL-NEXT: v_mul_f32_e32 v10, 0x2f800000, v1 -; GISEL-NEXT: v_trunc_f32_e32 v12, v10 -; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v12 +; GISEL-NEXT: v_trunc_f32_e32 v14, v10 +; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v14 ; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v1 -; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v12 -; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v13, v0 +; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v14 +; GISEL-NEXT: v_subb_u32_e32 v17, vcc, 0, v7, vcc ; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v15, 0 +; GISEL-NEXT: v_sub_i32_e32 v12, vcc, v12, v0 +; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GISEL-NEXT: v_mov_b32_e32 v0, v11 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v12, v[0:1] -; GISEL-NEXT: v_subb_u32_e64 v11, s[4:5], v14, v9, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v14, v[0:1] +; GISEL-NEXT: v_subb_u32_e64 v11, s[4:5], v13, v9, vcc ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v15, v[0:1] -; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v14, v9 -; GISEL-NEXT: v_mul_lo_u32 v9, v12, v10 -; GISEL-NEXT: v_mul_lo_u32 v14, v15, v0 +; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v13, v9 +; GISEL-NEXT: v_mul_lo_u32 v9, v14, v10 +; GISEL-NEXT: v_mul_lo_u32 v13, v15, v0 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v11, v8 ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v14 -; GISEL-NEXT: v_mul_hi_u32 v14, v15, v10 +; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v13 +; GISEL-NEXT: v_mul_hi_u32 v13, v15, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_hi_u32 v10, v12, v10 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v14 +; GISEL-NEXT: v_mul_hi_u32 v10, v14, v10 +; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v13, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[6:7] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v12, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[6:7] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v11, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v9, v9, v14, s[6:7] -; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v13, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v9, v9, v13, s[6:7] +; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v12, v5 ; GISEL-NEXT: v_subbrev_u32_e64 v19, s[6:7], 0, v1, vcc -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v14, v5 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v13, v5 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v19, v8 ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, -1, s[8:9] ; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[6:7] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v19, v8 -; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v14, v5 +; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v13, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[6:7] ; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v14, v5, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v8, v19, v1, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v14, v12, v0 +; GISEL-NEXT: v_mul_lo_u32 v13, v14, v0 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v18, v1 ; GISEL-NEXT: v_mul_hi_u32 v18, v15, v0 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v18 -; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v18 +; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v10 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v15, v1 -; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v12, v0, vcc +; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v14, v0, vcc ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v10, 0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 ; GISEL-NEXT: v_cndmask_b32_e32 v11, v11, v8, vcc -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v16, v12, v[1:2] -; GISEL-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc -; GISEL-NEXT: v_xor_b32_e32 v1, v5, v4 +; GISEL-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v12, v5, vcc +; GISEL-NEXT: v_mov_b32_e32 v8, v1 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v16, v13, v[8:9] +; GISEL-NEXT: v_ashrrev_i32_e32 v12, 31, v3 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v12 ; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v17, v10, v[8:9] -; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v3 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc -; GISEL-NEXT: v_xor_b32_e32 v13, v2, v5 -; GISEL-NEXT: v_mul_lo_u32 v2, v12, v0 -; GISEL-NEXT: v_mul_lo_u32 v9, v10, v8 -; GISEL-NEXT: v_xor_b32_e32 v14, v3, v5 -; GISEL-NEXT: v_mul_hi_u32 v3, v10, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v12, v8 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2 -; GISEL-NEXT: v_mul_hi_u32 v9, v10, v8 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v3, v12, vcc +; GISEL-NEXT: v_xor_b32_e32 v14, v1, v12 +; GISEL-NEXT: v_mul_lo_u32 v1, v13, v0 +; GISEL-NEXT: v_mul_lo_u32 v3, v10, v8 +; GISEL-NEXT: v_xor_b32_e32 v15, v2, v12 +; GISEL-NEXT: v_mul_hi_u32 v2, v10, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 -; GISEL-NEXT: v_mul_hi_u32 v8, v12, v8 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v2, v13, v8 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; GISEL-NEXT: v_mul_hi_u32 v3, v10, v8 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v2, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v8, v2 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GISEL-NEXT: v_mul_hi_u32 v3, v13, v8 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v3, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v12, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v14, v0 -; GISEL-NEXT: v_mul_lo_u32 v8, v13, v2 -; GISEL-NEXT: v_mul_hi_u32 v9, v13, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v13, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v2, v15, v0 +; GISEL-NEXT: v_mul_lo_u32 v3, v14, v1 +; GISEL-NEXT: v_mul_hi_u32 v8, v14, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 ; GISEL-NEXT: v_xor_b32_e32 v10, v11, v4 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, v14, v2 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v8, v3 -; GISEL-NEXT: v_mul_hi_u32 v8, v13, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v8, v15, v1 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GISEL-NEXT: v_mul_hi_u32 v3, v14, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v3 -; GISEL-NEXT: v_mul_hi_u32 v9, v14, v2 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v3 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v2 +; GISEL-NEXT: v_mul_hi_u32 v1, v15, v1 ; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v11, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v0 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v1, v0 +; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GISEL-NEXT: v_xor_b32_e32 v5, v5, v4 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 ; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v8, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v5, v4 ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v10, v4, vcc ; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v11, v[8:9] -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v13, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v14, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v14, v3 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v14, v2 +; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v15, v3, vcc +; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v15, v3 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v7 ; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[4:5] -; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v2, v6 -; GISEL-NEXT: v_subbrev_u32_e64 v10, s[4:5], 0, v3, vcc -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v8, s[4:5] +; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v2, v6 +; GISEL-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v3, vcc +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v6 ; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v7 -; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v9, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v7 +; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v8, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5] ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GISEL-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GISEL-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v5 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v5 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v5, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v12 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v12 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v12 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_srem_v2i64: @@ -651,100 +671,106 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_xor_b32_e32 v1, v2, v1 ; CGP-NEXT: v_cvt_f32_u32_e32 v2, v0 ; CGP-NEXT: v_cvt_f32_u32_e32 v3, v1 -; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v0 -; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v1, vcc +; CGP-NEXT: v_sub_i32_e32 v13, vcc, 0, v0 +; CGP-NEXT: v_subb_u32_e32 v14, vcc, 0, v1, vcc ; CGP-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CGP-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CGP-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; CGP-NEXT: v_trunc_f32_e32 v4, v3 ; CGP-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v5, v2 -; CGP-NEXT: v_cvt_u32_f32_e32 v14, v4 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v5, 0 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v14, v[3:4] -; CGP-NEXT: v_mul_hi_u32 v15, v5, v2 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v5, v[3:4] -; CGP-NEXT: v_mul_lo_u32 v4, v14, v2 -; CGP-NEXT: v_mul_hi_u32 v2, v14, v2 -; CGP-NEXT: v_mul_lo_u32 v16, v5, v3 -; CGP-NEXT: v_mul_lo_u32 v17, v14, v3 +; CGP-NEXT: v_cvt_u32_f32_e32 v12, v2 +; CGP-NEXT: v_cvt_u32_f32_e32 v15, v4 +; CGP-NEXT: ; implicit-def: $vgpr4_vgpr5 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v13, v12, 0 +; CGP-NEXT: v_mov_b32_e32 v4, v3 +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v15, v[4:5] +; CGP-NEXT: v_mul_hi_u32 v5, v12, v2 +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v14, v12, v[3:4] +; CGP-NEXT: v_mul_lo_u32 v4, v15, v2 +; CGP-NEXT: v_mul_hi_u32 v2, v15, v2 +; CGP-NEXT: v_mul_lo_u32 v16, v12, v3 +; CGP-NEXT: v_mul_lo_u32 v17, v15, v3 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v16 ; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v15 -; CGP-NEXT: v_mul_hi_u32 v15, v5, v3 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v12, v3 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v16, v4 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v17, v2 ; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v15, vcc, v16, v15 -; CGP-NEXT: v_mul_hi_u32 v3, v14, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v16, v5 +; CGP-NEXT: v_mul_hi_u32 v3, v15, v3 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v2 -; CGP-NEXT: v_addc_u32_e32 v14, vcc, v14, v3, vcc -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v5, 0 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v14, v[3:4] -; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v11 -; CGP-NEXT: v_mul_hi_u32 v15, v5, v2 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v5, v[3:4] -; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v12 -; CGP-NEXT: v_addc_u32_e32 v10, vcc, v11, v12, vcc -; CGP-NEXT: v_xor_b32_e32 v11, v4, v12 -; CGP-NEXT: v_mul_lo_u32 v4, v14, v2 -; CGP-NEXT: v_mul_lo_u32 v13, v5, v3 -; CGP-NEXT: v_mul_hi_u32 v2, v14, v2 -; CGP-NEXT: v_xor_b32_e32 v10, v10, v12 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v15 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v2 +; CGP-NEXT: v_addc_u32_e32 v15, vcc, v15, v3, vcc +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v13, v12, 0 +; CGP-NEXT: ; implicit-def: $vgpr4_vgpr5 +; CGP-NEXT: v_mov_b32_e32 v4, v3 +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v15, v[4:5] +; CGP-NEXT: v_ashrrev_i32_e32 v13, 31, v11 +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v14, v12, v[3:4] +; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v13 +; CGP-NEXT: v_addc_u32_e32 v5, vcc, v11, v13, vcc +; CGP-NEXT: v_xor_b32_e32 v10, v4, v13 +; CGP-NEXT: v_mul_lo_u32 v4, v15, v2 +; CGP-NEXT: v_mul_lo_u32 v11, v12, v3 +; CGP-NEXT: v_xor_b32_e32 v14, v5, v13 +; CGP-NEXT: v_mul_hi_u32 v5, v12, v2 +; CGP-NEXT: v_mul_hi_u32 v2, v15, v2 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v15, v14, v3 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; CGP-NEXT: v_mul_hi_u32 v13, v5, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v15, v2 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v13 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; CGP-NEXT: v_mul_hi_u32 v3, v14, v3 +; CGP-NEXT: v_mul_lo_u32 v5, v15, v3 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 +; CGP-NEXT: v_mul_hi_u32 v11, v12, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v11 +; CGP-NEXT: v_mul_hi_u32 v3, v15, v3 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v14, v3, vcc -; CGP-NEXT: v_mul_lo_u32 v4, v10, v2 -; CGP-NEXT: v_mul_lo_u32 v5, v11, v3 -; CGP-NEXT: v_mul_hi_u32 v13, v11, v2 -; CGP-NEXT: v_mul_hi_u32 v2, v10, v2 -; CGP-NEXT: v_mul_hi_u32 v14, v10, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v12, v2 +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v15, v3, vcc +; CGP-NEXT: v_mul_lo_u32 v4, v14, v2 +; CGP-NEXT: v_mul_lo_u32 v5, v10, v3 +; CGP-NEXT: v_mul_hi_u32 v11, v10, v2 +; CGP-NEXT: v_mul_hi_u32 v2, v14, v2 +; CGP-NEXT: v_mul_hi_u32 v12, v14, v3 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v10, v3 +; CGP-NEXT: v_mul_lo_u32 v11, v14, v3 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CGP-NEXT: v_mul_hi_u32 v5, v11, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v13, v2 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v5, v10, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v11, v2 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v13, v5 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v2, v4 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v13, 0 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v11, v5 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v2, v4 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v11, 0 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v14, v4 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v0, v4, v[3:4] -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v11, v2 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v13, v[3:4] -; CGP-NEXT: v_subb_u32_e64 v4, s[4:5], v10, v3, vcc -; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v10, v3 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v4 +; CGP-NEXT: ; implicit-def: $vgpr4_vgpr5 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v10, v2 +; CGP-NEXT: v_mov_b32_e32 v4, v3 +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v0, v12, v[4:5] +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v11, v[3:4] +; CGP-NEXT: v_subb_u32_e64 v4, s[4:5], v14, v3, vcc +; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v14, v3 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v0 @@ -755,24 +781,24 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_sub_i32_e32 v10, vcc, v2, v0 ; CGP-NEXT: v_subbrev_u32_e64 v11, s[4:5], 0, v3, vcc ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v1 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v0 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v11, v1 ; CGP-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v10, v0 -; CGP-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v12, v12, v14, s[4:5] ; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 ; CGP-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc ; CGP-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CGP-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; CGP-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; CGP-NEXT: v_xor_b32_e32 v0, v0, v12 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v12 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v12 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v12, vcc +; CGP-NEXT: v_xor_b32_e32 v0, v0, v13 +; CGP-NEXT: v_xor_b32_e32 v1, v1, v13 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v13 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v13, vcc ; CGP-NEXT: ; implicit-def: $vgpr4 ; CGP-NEXT: ; implicit-def: $vgpr10 ; CGP-NEXT: .LBB2_2: ; %Flow1 @@ -820,100 +846,106 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_xor_b32_e32 v3, v4, v3 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v5, v3 -; CGP-NEXT: v_sub_i32_e32 v10, vcc, 0, v2 -; CGP-NEXT: v_subb_u32_e32 v11, vcc, 0, v3, vcc +; CGP-NEXT: v_sub_i32_e32 v11, vcc, 0, v2 +; CGP-NEXT: v_subb_u32_e32 v12, vcc, 0, v3, vcc ; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; CGP-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 ; CGP-NEXT: v_trunc_f32_e32 v6, v5 ; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v6 -; CGP-NEXT: v_cvt_u32_f32_e32 v7, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v12, v6 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v7, 0 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v12, v[5:6] -; CGP-NEXT: v_mul_hi_u32 v13, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v7, v[5:6] -; CGP-NEXT: v_mul_lo_u32 v6, v12, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v12, v4 -; CGP-NEXT: v_mul_lo_u32 v14, v7, v5 -; CGP-NEXT: v_mul_lo_u32 v15, v12, v5 +; CGP-NEXT: v_cvt_u32_f32_e32 v10, v4 +; CGP-NEXT: v_cvt_u32_f32_e32 v13, v6 +; CGP-NEXT: ; implicit-def: $vgpr6_vgpr7 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v11, v10, 0 +; CGP-NEXT: v_mov_b32_e32 v6, v5 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v13, v[6:7] +; CGP-NEXT: v_mul_hi_u32 v7, v10, v4 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v10, v[5:6] +; CGP-NEXT: v_mul_lo_u32 v6, v13, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v13, v4 +; CGP-NEXT: v_mul_lo_u32 v14, v10, v5 +; CGP-NEXT: v_mul_lo_u32 v15, v13, v5 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v7, v5 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; CGP-NEXT: v_mul_hi_u32 v7, v10, v5 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v6, vcc, v14, v6 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_mul_hi_u32 v5, v12, v5 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v14, v7 +; CGP-NEXT: v_mul_hi_u32 v5, v13, v5 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v13, v6 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v4 -; CGP-NEXT: v_addc_u32_e32 v12, vcc, v12, v5, vcc -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v7, 0 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v12, v[5:6] -; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v9 -; CGP-NEXT: v_mul_hi_u32 v13, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v7, v[5:6] -; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v10 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v10, vcc -; CGP-NEXT: v_xor_b32_e32 v9, v6, v10 -; CGP-NEXT: v_mul_lo_u32 v6, v12, v4 -; CGP-NEXT: v_mul_lo_u32 v11, v7, v5 -; CGP-NEXT: v_mul_hi_u32 v4, v12, v4 -; CGP-NEXT: v_xor_b32_e32 v8, v8, v10 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v11 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v13 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v4 +; CGP-NEXT: v_addc_u32_e32 v13, vcc, v13, v5, vcc +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v11, v10, 0 +; CGP-NEXT: ; implicit-def: $vgpr6_vgpr7 +; CGP-NEXT: v_mov_b32_e32 v6, v5 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v13, v[6:7] +; CGP-NEXT: v_ashrrev_i32_e32 v11, 31, v9 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v10, v[5:6] +; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v11 +; CGP-NEXT: v_addc_u32_e32 v7, vcc, v9, v11, vcc +; CGP-NEXT: v_xor_b32_e32 v8, v6, v11 +; CGP-NEXT: v_mul_lo_u32 v6, v13, v4 +; CGP-NEXT: v_mul_lo_u32 v9, v10, v5 +; CGP-NEXT: v_xor_b32_e32 v12, v7, v11 +; CGP-NEXT: v_mul_hi_u32 v7, v10, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v13, v4 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v12, v5 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v11, v6 -; CGP-NEXT: v_mul_hi_u32 v11, v7, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; CGP-NEXT: v_mul_hi_u32 v5, v12, v5 +; CGP-NEXT: v_mul_lo_u32 v7, v13, v5 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6 +; CGP-NEXT: v_mul_hi_u32 v9, v10, v5 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; CGP-NEXT: v_mul_hi_u32 v5, v13, v5 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v11, v6 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; CGP-NEXT: v_addc_u32_e32 v5, vcc, v12, v5, vcc -; CGP-NEXT: v_mul_lo_u32 v6, v8, v4 -; CGP-NEXT: v_mul_lo_u32 v7, v9, v5 -; CGP-NEXT: v_mul_hi_u32 v11, v9, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v8, v4 -; CGP-NEXT: v_mul_hi_u32 v12, v8, v5 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 +; CGP-NEXT: v_addc_u32_e32 v5, vcc, v13, v5, vcc +; CGP-NEXT: v_mul_lo_u32 v6, v12, v4 +; CGP-NEXT: v_mul_lo_u32 v7, v8, v5 +; CGP-NEXT: v_mul_hi_u32 v9, v8, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v12, v4 +; CGP-NEXT: v_mul_hi_u32 v10, v12, v5 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v11 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v9 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v11, v8, v5 +; CGP-NEXT: v_mul_lo_u32 v9, v12, v5 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_mul_hi_u32 v7, v9, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v7, v8, v5 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v11, v7 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v4, v6 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v11, 0 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v4, v6 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v9, 0 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v12, v6 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v6, v[5:6] -; CGP-NEXT: v_sub_i32_e32 v4, vcc, v9, v4 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v11, v[5:6] -; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v8, v5, vcc -; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v8, v5 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v6 +; CGP-NEXT: ; implicit-def: $vgpr6_vgpr7 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v8, v4 +; CGP-NEXT: v_mov_b32_e32 v6, v5 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v10, v[6:7] +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v9, v[5:6] +; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v12, v5, vcc +; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v12, v5 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v2 @@ -924,24 +956,24 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_sub_i32_e32 v8, vcc, v4, v2 ; CGP-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v5, vcc ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v2 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v3 ; CGP-NEXT: v_subb_u32_e32 v3, vcc, v5, v3, vcc ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v8, v2 -; CGP-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v10, v10, v12, s[4:5] ; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; CGP-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc ; CGP-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 ; CGP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; CGP-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc -; CGP-NEXT: v_xor_b32_e32 v2, v2, v10 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v10 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc +; CGP-NEXT: v_xor_b32_e32 v2, v2, v11 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v11 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v11 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v11, vcc ; CGP-NEXT: ; implicit-def: $vgpr6 ; CGP-NEXT: ; implicit-def: $vgpr8 ; CGP-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] @@ -977,77 +1009,81 @@ define i64 @v_srem_i64_pow2k_denom(i64 %num) { ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x1000 ; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v3, 0 -; CHECK-NEXT: v_mov_b32_e32 v6, 0xfffff000 +; CHECK-NEXT: v_mov_b32_e32 v7, 0xfffff000 ; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; CHECK-NEXT: v_trunc_f32_e32 v4, v3 ; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 -; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v4 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4] -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v2 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4] -; CHECK-NEXT: v_mul_lo_u32 v4, v7, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2 -; CHECK-NEXT: v_mul_lo_u32 v9, v5, v3 -; CHECK-NEXT: v_mul_lo_u32 v10, v7, v3 -; CHECK-NEXT: v_mul_hi_u32 v11, v5, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3 +; CHECK-NEXT: v_cvt_u32_f32_e32 v6, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v4 +; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v6, 0 +; CHECK-NEXT: v_mov_b32_e32 v4, v3 +; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v8, v[4:5] +; CHECK-NEXT: v_mul_hi_u32 v5, v6, v2 +; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v6, v[3:4] +; CHECK-NEXT: v_mul_lo_u32 v4, v8, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v8, v2 +; CHECK-NEXT: v_mul_lo_u32 v9, v6, v3 +; CHECK-NEXT: v_mul_lo_u32 v10, v8, v3 +; CHECK-NEXT: v_mul_hi_u32 v11, v6, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v8, v3 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v9 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v9, v4 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v11 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v10, v8 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v10, v5 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v2 -; CHECK-NEXT: v_addc_u32_e32 v7, vcc, v7, v3, vcc -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4] -; CHECK-NEXT: v_ashrrev_i32_e32 v6, 31, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4] -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc -; CHECK-NEXT: v_xor_b32_e32 v4, v0, v6 -; CHECK-NEXT: v_mul_lo_u32 v0, v7, v2 -; CHECK-NEXT: v_mul_lo_u32 v8, v5, v3 -; CHECK-NEXT: v_xor_b32_e32 v9, v1, v6 -; CHECK-NEXT: v_mul_hi_u32 v1, v5, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v2 +; CHECK-NEXT: v_addc_u32_e32 v8, vcc, v8, v3, vcc +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v6, 0 +; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 +; CHECK-NEXT: v_mov_b32_e32 v4, v3 +; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v8, v[4:5] +; CHECK-NEXT: v_ashrrev_i32_e32 v5, 31, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v5 +; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v6, v[3:4] +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v5, vcc +; CHECK-NEXT: v_xor_b32_e32 v4, v0, v5 +; CHECK-NEXT: v_mul_lo_u32 v0, v8, v2 +; CHECK-NEXT: v_mul_lo_u32 v7, v6, v3 +; CHECK-NEXT: v_xor_b32_e32 v9, v1, v5 +; CHECK-NEXT: v_mul_hi_u32 v1, v6, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v8, v2 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v1, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v3 +; CHECK-NEXT: v_mul_lo_u32 v1, v8, v3 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0 +; CHECK-NEXT: v_mul_hi_u32 v7, v6, v3 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8 -; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 +; CHECK-NEXT: v_mul_hi_u32 v3, v8, v3 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v6, v0 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v8, v1, vcc ; CHECK-NEXT: v_mul_lo_u32 v2, v9, v0 ; CHECK-NEXT: v_mul_lo_u32 v3, v4, v1 ; CHECK-NEXT: v_mul_hi_u32 v7, v4, v0 ; CHECK-NEXT: v_mul_hi_u32 v0, v9, v0 -; CHECK-NEXT: v_mov_b32_e32 v5, 0x1000 +; CHECK-NEXT: v_mov_b32_e32 v6, 0x1000 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 @@ -1062,37 +1098,39 @@ define i64 @v_srem_i64_pow2k_denom(i64 %num) { ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CHECK-NEXT: v_mul_hi_u32 v7, v9, v1 -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v0, 0 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v0, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[1:2] +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v2 +; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v4, v0 +; CHECK-NEXT: v_mov_b32_e32 v2, v1 +; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v7, v[2:3] ; CHECK-NEXT: v_subb_u32_e64 v2, s[4:5], v9, v1, vcc ; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v9, v1 ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v0, v5 +; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v0, v6 ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v5 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v5 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v6 ; CHECK-NEXT: v_cndmask_b32_e32 v7, -1, v7, vcc -; CHECK-NEXT: v_sub_i32_e32 v5, vcc, v4, v5 +; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v4, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2 ; CHECK-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v1, vcc ; CHECK-NEXT: v_cndmask_b32_e64 v3, -1, v3, s[4:5] ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v6 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v6 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v5 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v5 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = srem i64 %num, 4096 ret i64 %result @@ -1105,6 +1143,7 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cvt_f32_u32_e32 v4, 0x1000 ; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v5, 0 ; GISEL-NEXT: v_mov_b32_e32 v6, 0xfffff000 +; GISEL-NEXT: ; implicit-def: $vgpr9_vgpr10 ; GISEL-NEXT: s_mov_b32 s6, 1 ; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 @@ -1141,42 +1180,43 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v7, v4 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v4, 0 +; GISEL-NEXT: ; implicit-def: $vgpr15_vgpr16 ; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc -; GISEL-NEXT: v_mov_b32_e32 v4, v14 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v4, v17, v13 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], s6, v16, v[14:15] +; GISEL-NEXT: v_mov_b32_e32 v15, v14 +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[15:16] +; GISEL-NEXT: v_mul_lo_u32 v9, v17, v13 +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], s6, v4, v[14:15] ; GISEL-NEXT: s_mov_b32 s6, 1 ; GISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GISEL-NEXT: v_mul_lo_u32 v9, v16, v14 +; GISEL-NEXT: v_mul_lo_u32 v15, v4, v14 ; GISEL-NEXT: s_subb_u32 s6, 0, 0 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v16, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_mul_hi_u32 v9, v17, v13 -; GISEL-NEXT: v_mul_lo_u32 v13, v17, v14 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; GISEL-NEXT: v_mul_hi_u32 v15, v16, v14 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v9, v15 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v15 +; GISEL-NEXT: v_mul_hi_u32 v15, v4, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v13, v17, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v15, v17, v14 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v4, v14 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v9 ; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc ; GISEL-NEXT: v_xor_b32_e32 v18, v0, v9 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v17, v14 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v16 +; GISEL-NEXT: v_mul_hi_u32 v13, v17, v14 ; GISEL-NEXT: v_xor_b32_e32 v19, v1, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v15, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc ; GISEL-NEXT: v_mul_lo_u32 v13, v19, v0 ; GISEL-NEXT: v_mul_lo_u32 v14, v18, v1 @@ -1200,23 +1240,26 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2] +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v13 +; GISEL-NEXT: ; implicit-def: $vgpr13_vgpr14 +; GISEL-NEXT: v_mov_b32_e32 v13, v1 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v15, v[13:14] ; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v18, v0 ; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13 ; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], v19, v13, vcc ; GISEL-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc ; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v14, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v0, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v16, v4 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v16, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v13, -1, v1, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e32 v18, -1, v0, vcc +; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v13, -1, v1, s[4:5] ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[0:1] ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v7, v[0:1] ; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v16, v4 @@ -1242,95 +1285,99 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v1 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v7, v1 +; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v8, v0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; GISEL-NEXT: v_cndmask_b32_e32 v11, v15, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v8, v[1:2] -; GISEL-NEXT: v_xor_b32_e32 v1, v11, v9 -; GISEL-NEXT: v_ashrrev_i32_e32 v11, 31, v3 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v7, v[5:6] -; GISEL-NEXT: v_cndmask_b32_e32 v10, v14, v16, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v11 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v11, vcc -; GISEL-NEXT: v_xor_b32_e32 v12, v2, v11 -; GISEL-NEXT: v_mul_lo_u32 v2, v8, v0 -; GISEL-NEXT: v_mul_lo_u32 v6, v7, v5 -; GISEL-NEXT: v_xor_b32_e32 v13, v3, v11 -; GISEL-NEXT: v_mul_hi_u32 v3, v7, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v8, v0 +; GISEL-NEXT: v_cndmask_b32_e32 v7, v14, v16, vcc +; GISEL-NEXT: v_xor_b32_e32 v13, v7, v9 +; GISEL-NEXT: ; implicit-def: $vgpr7_vgpr8 +; GISEL-NEXT: v_cndmask_b32_e32 v12, v15, v5, vcc +; GISEL-NEXT: v_mov_b32_e32 v7, v1 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v11, v[7:8] +; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v3 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v7 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v10, v[5:6] +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v7, vcc +; GISEL-NEXT: v_xor_b32_e32 v8, v2, v7 +; GISEL-NEXT: v_mul_lo_u32 v2, v11, v0 +; GISEL-NEXT: v_mul_lo_u32 v6, v10, v5 +; GISEL-NEXT: v_xor_b32_e32 v1, v12, v9 +; GISEL-NEXT: v_xor_b32_e32 v12, v3, v7 +; GISEL-NEXT: v_mul_hi_u32 v3, v10, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v8, v5 +; GISEL-NEXT: v_mul_lo_u32 v3, v11, v5 +; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; GISEL-NEXT: v_mul_hi_u32 v6, v7, v5 +; GISEL-NEXT: v_mul_hi_u32 v6, v10, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GISEL-NEXT: v_mul_hi_u32 v5, v8, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v11, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, v13, v3 -; GISEL-NEXT: v_mul_lo_u32 v6, v12, v2 -; GISEL-NEXT: v_xor_b32_e32 v10, v10, v9 -; GISEL-NEXT: v_mul_hi_u32 v7, v12, v3 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v10, v9 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v10, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v5, v12, v3 +; GISEL-NEXT: v_mul_lo_u32 v6, v8, v2 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v13, v9 ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc +; GISEL-NEXT: v_mul_hi_u32 v9, v8, v3 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v7, v13, v2 -; GISEL-NEXT: v_mul_hi_u32 v3, v13, v3 +; GISEL-NEXT: v_mul_lo_u32 v9, v12, v2 +; GISEL-NEXT: v_mul_hi_u32 v3, v12, v3 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GISEL-NEXT: v_mul_hi_u32 v6, v12, v2 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; GISEL-NEXT: v_mul_hi_u32 v6, v8, v2 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v9, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GISEL-NEXT: v_mul_hi_u32 v7, v13, v2 +; GISEL-NEXT: v_mul_hi_u32 v9, v12, v2 ; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v3, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v5, v[3:4] -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 -; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v5, vcc -; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v5 +; GISEL-NEXT: ; implicit-def: $vgpr5_vgpr6 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v8, v2 +; GISEL-NEXT: v_mov_b32_e32 v5, v3 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v9, v[5:6] +; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v12, v5, vcc +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v12, v5 ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc -; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v2, v4 +; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v2, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v7, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v8, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v8, -1, v8, vcc -; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v7, v4 +; GISEL-NEXT: v_cndmask_b32_e32 v9, -1, v9, vcc +; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v8, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v3 -; GISEL-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v5, vcc +; GISEL-NEXT: v_subbrev_u32_e32 v10, vcc, 0, v5, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v6, -1, v6, s[4:5] -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; GISEL-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v11 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v11 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v11 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v11, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v7 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v7 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v7 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_srem_v2i64_pow2k_denom: @@ -1339,6 +1386,7 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_cvt_f32_u32_e32 v4, 0x1000 ; CGP-NEXT: v_cvt_f32_ubyte0_e32 v5, 0 ; CGP-NEXT: v_mov_b32_e32 v6, 0xfffff000 +; CGP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 @@ -1372,39 +1420,40 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v13 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v4, 0 +; CGP-NEXT: ; implicit-def: $vgpr15_vgpr16 ; CGP-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc -; CGP-NEXT: v_mov_b32_e32 v4, v14 -; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5] -; CGP-NEXT: v_mul_lo_u32 v4, v17, v13 -; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], -1, v16, v[14:15] -; CGP-NEXT: v_mul_lo_u32 v9, v16, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v16, v13 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_hi_u32 v9, v17, v13 -; CGP-NEXT: v_mul_lo_u32 v13, v17, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_mul_hi_u32 v15, v16, v14 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v15, vcc, v9, v15 +; CGP-NEXT: v_mov_b32_e32 v15, v14 +; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[15:16] +; CGP-NEXT: v_mul_lo_u32 v9, v17, v13 +; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], -1, v4, v[14:15] +; CGP-NEXT: v_mul_lo_u32 v15, v4, v14 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v15 +; CGP-NEXT: v_mul_hi_u32 v15, v4, v13 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v13, v17, v13 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v15 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v15, v17, v14 +; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v9 +; CGP-NEXT: v_mul_hi_u32 v9, v4, v14 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v9 ; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v1 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v9 ; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc ; CGP-NEXT: v_xor_b32_e32 v18, v0, v9 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v17, v14 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v13, v16 +; CGP-NEXT: v_mul_hi_u32 v13, v17, v14 ; CGP-NEXT: v_xor_b32_e32 v19, v1, v9 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v1, vcc, v15, v1 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v0 ; CGP-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc ; CGP-NEXT: v_mul_lo_u32 v13, v19, v0 ; CGP-NEXT: v_mul_lo_u32 v14, v18, v1 @@ -1428,24 +1477,27 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2] +; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v13 +; CGP-NEXT: ; implicit-def: $vgpr13_vgpr14 +; CGP-NEXT: v_mov_b32_e32 v13, v1 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v15, v[13:14] ; CGP-NEXT: v_sub_i32_e32 v14, vcc, v18, v0 -; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13 ; CGP-NEXT: v_subb_u32_e64 v15, s[4:5], v19, v13, vcc -; CGP-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc +; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v4 -; CGP-NEXT: v_sub_i32_e32 v16, vcc, v14, v4 +; CGP-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 +; CGP-NEXT: v_sub_i32_e32 v16, vcc, v14, v4 +; CGP-NEXT: v_cndmask_b32_e64 v13, -1, v1, s[4:5] ; CGP-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v0, vcc +; CGP-NEXT: ; implicit-def: $vgpr0_vgpr1 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v16, v4 ; CGP-NEXT: v_mov_b32_e32 v0, v5 -; CGP-NEXT: v_cndmask_b32_e64 v13, -1, v1, s[4:5] ; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[0:1] -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v16, v4 ; CGP-NEXT: v_cndmask_b32_e64 v18, 0, -1, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[0:1] ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[0:1] ; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v18, vcc ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v16, v4 ; CGP-NEXT: v_subbrev_u32_e32 v18, vcc, 0, v17, vcc @@ -1470,95 +1522,99 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v1 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v7, v1 +; CGP-NEXT: v_addc_u32_e32 v11, vcc, v8, v0, vcc +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0 ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; CGP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; CGP-NEXT: v_cndmask_b32_e32 v5, v14, v16, vcc -; CGP-NEXT: v_xor_b32_e32 v11, v5, v9 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v8, v[1:2] -; CGP-NEXT: v_cndmask_b32_e32 v10, v15, v17, vcc -; CGP-NEXT: v_xor_b32_e32 v1, v10, v9 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v7, v[5:6] -; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc -; CGP-NEXT: v_xor_b32_e32 v12, v2, v10 -; CGP-NEXT: v_mul_lo_u32 v2, v8, v0 -; CGP-NEXT: v_mul_lo_u32 v6, v7, v5 -; CGP-NEXT: v_xor_b32_e32 v13, v3, v10 -; CGP-NEXT: v_mul_hi_u32 v3, v7, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 +; CGP-NEXT: v_mov_b32_e32 v7, v1 +; CGP-NEXT: v_xor_b32_e32 v13, v5, v9 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v11, v[7:8] +; CGP-NEXT: v_ashrrev_i32_e32 v7, 31, v3 +; CGP-NEXT: v_cndmask_b32_e32 v12, v15, v17, vcc +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v10, v[5:6] +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v7 +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v7, vcc +; CGP-NEXT: v_xor_b32_e32 v8, v2, v7 +; CGP-NEXT: v_mul_lo_u32 v2, v11, v0 +; CGP-NEXT: v_mul_lo_u32 v6, v10, v5 +; CGP-NEXT: v_xor_b32_e32 v1, v12, v9 +; CGP-NEXT: v_xor_b32_e32 v12, v3, v7 +; CGP-NEXT: v_mul_hi_u32 v3, v10, v0 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v3, v8, v5 +; CGP-NEXT: v_mul_lo_u32 v3, v11, v5 +; CGP-NEXT: v_mul_hi_u32 v0, v11, v0 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; CGP-NEXT: v_mul_hi_u32 v6, v7, v5 +; CGP-NEXT: v_mul_hi_u32 v6, v10, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; CGP-NEXT: v_mul_hi_u32 v5, v8, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v11, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v0 -; CGP-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc -; CGP-NEXT: v_mul_lo_u32 v5, v13, v3 -; CGP-NEXT: v_mul_lo_u32 v6, v12, v2 -; CGP-NEXT: v_mul_hi_u32 v7, v12, v3 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v9 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v0 +; CGP-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc +; CGP-NEXT: v_mul_lo_u32 v5, v12, v3 +; CGP-NEXT: v_mul_lo_u32 v6, v8, v2 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v13, v9 ; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc +; CGP-NEXT: v_mul_hi_u32 v9, v8, v3 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v7, v13, v2 -; CGP-NEXT: v_mul_hi_u32 v3, v13, v3 +; CGP-NEXT: v_mul_lo_u32 v9, v12, v2 +; CGP-NEXT: v_mul_hi_u32 v3, v12, v3 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CGP-NEXT: v_mul_hi_u32 v6, v12, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v6, v8, v2 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v3 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; CGP-NEXT: v_mul_hi_u32 v7, v13, v2 +; CGP-NEXT: v_mul_hi_u32 v9, v12, v2 ; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v3, 0 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v5, v[3:4] -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 -; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v5, vcc -; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v5 +; CGP-NEXT: ; implicit-def: $vgpr5_vgpr6 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v8, v2 +; CGP-NEXT: v_mov_b32_e32 v5, v3 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v9, v[5:6] +; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v12, v5, vcc +; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v12, v5 ; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc -; CGP-NEXT: v_sub_i32_e32 v7, vcc, v2, v4 +; CGP-NEXT: v_sub_i32_e32 v8, vcc, v2, v4 ; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v7, v4 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v8, v4 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v4 -; CGP-NEXT: v_cndmask_b32_e32 v8, -1, v8, vcc -; CGP-NEXT: v_sub_i32_e32 v4, vcc, v7, v4 +; CGP-NEXT: v_cndmask_b32_e32 v9, -1, v9, vcc +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v8, v4 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v3 -; CGP-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v5, vcc +; CGP-NEXT: v_subbrev_u32_e32 v10, vcc, 0, v5, vcc ; CGP-NEXT: v_cndmask_b32_e64 v6, -1, v6, s[4:5] -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; CGP-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc -; CGP-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; CGP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; CGP-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; CGP-NEXT: v_xor_b32_e32 v2, v2, v10 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v10 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc +; CGP-NEXT: v_xor_b32_e32 v2, v2, v7 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v7 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v7 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc ; CGP-NEXT: s_setpc_b64 s[30:31] %result = srem <2 x i64> %num, ret <2 x i64> %result @@ -1570,77 +1626,81 @@ define i64 @v_srem_i64_oddk_denom(i64 %num) { ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x12d8fb ; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v3, 0 -; CHECK-NEXT: v_mov_b32_e32 v6, 0xffed2705 +; CHECK-NEXT: v_mov_b32_e32 v7, 0xffed2705 ; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; CHECK-NEXT: v_trunc_f32_e32 v4, v3 ; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 -; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v4 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4] -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v2 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4] -; CHECK-NEXT: v_mul_lo_u32 v4, v7, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2 -; CHECK-NEXT: v_mul_lo_u32 v9, v5, v3 -; CHECK-NEXT: v_mul_lo_u32 v10, v7, v3 -; CHECK-NEXT: v_mul_hi_u32 v11, v5, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3 +; CHECK-NEXT: v_cvt_u32_f32_e32 v6, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v4 +; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v6, 0 +; CHECK-NEXT: v_mov_b32_e32 v4, v3 +; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v8, v[4:5] +; CHECK-NEXT: v_mul_hi_u32 v5, v6, v2 +; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v6, v[3:4] +; CHECK-NEXT: v_mul_lo_u32 v4, v8, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v8, v2 +; CHECK-NEXT: v_mul_lo_u32 v9, v6, v3 +; CHECK-NEXT: v_mul_lo_u32 v10, v8, v3 +; CHECK-NEXT: v_mul_hi_u32 v11, v6, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v8, v3 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v9 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v9, v4 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v11 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v10, v8 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v10, v5 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v2 -; CHECK-NEXT: v_addc_u32_e32 v7, vcc, v7, v3, vcc -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4] -; CHECK-NEXT: v_ashrrev_i32_e32 v6, 31, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4] -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc -; CHECK-NEXT: v_xor_b32_e32 v4, v0, v6 -; CHECK-NEXT: v_mul_lo_u32 v0, v7, v2 -; CHECK-NEXT: v_mul_lo_u32 v8, v5, v3 -; CHECK-NEXT: v_xor_b32_e32 v9, v1, v6 -; CHECK-NEXT: v_mul_hi_u32 v1, v5, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v2 +; CHECK-NEXT: v_addc_u32_e32 v8, vcc, v8, v3, vcc +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v6, 0 +; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 +; CHECK-NEXT: v_mov_b32_e32 v4, v3 +; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v8, v[4:5] +; CHECK-NEXT: v_ashrrev_i32_e32 v5, 31, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v5 +; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v6, v[3:4] +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v5, vcc +; CHECK-NEXT: v_xor_b32_e32 v4, v0, v5 +; CHECK-NEXT: v_mul_lo_u32 v0, v8, v2 +; CHECK-NEXT: v_mul_lo_u32 v7, v6, v3 +; CHECK-NEXT: v_xor_b32_e32 v9, v1, v5 +; CHECK-NEXT: v_mul_hi_u32 v1, v6, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v8, v2 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v1, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v3 +; CHECK-NEXT: v_mul_lo_u32 v1, v8, v3 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0 +; CHECK-NEXT: v_mul_hi_u32 v7, v6, v3 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8 -; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 +; CHECK-NEXT: v_mul_hi_u32 v3, v8, v3 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v6, v0 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v8, v1, vcc ; CHECK-NEXT: v_mul_lo_u32 v2, v9, v0 ; CHECK-NEXT: v_mul_lo_u32 v3, v4, v1 ; CHECK-NEXT: v_mul_hi_u32 v7, v4, v0 ; CHECK-NEXT: v_mul_hi_u32 v0, v9, v0 -; CHECK-NEXT: v_mov_b32_e32 v5, 0x12d8fb +; CHECK-NEXT: v_mov_b32_e32 v6, 0x12d8fb ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 @@ -1655,37 +1715,39 @@ define i64 @v_srem_i64_oddk_denom(i64 %num) { ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CHECK-NEXT: v_mul_hi_u32 v7, v9, v1 -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v0, 0 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v0, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[1:2] +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v2 +; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v4, v0 +; CHECK-NEXT: v_mov_b32_e32 v2, v1 +; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v7, v[2:3] ; CHECK-NEXT: v_subb_u32_e64 v2, s[4:5], v9, v1, vcc ; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v9, v1 ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v0, v5 +; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v0, v6 ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v5 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v5 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v6 ; CHECK-NEXT: v_cndmask_b32_e32 v7, -1, v7, vcc -; CHECK-NEXT: v_sub_i32_e32 v5, vcc, v4, v5 +; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v4, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2 ; CHECK-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v1, vcc ; CHECK-NEXT: v_cndmask_b32_e64 v3, -1, v3, s[4:5] ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v6 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v6 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v5 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v5 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = srem i64 %num, 1235195 ret i64 %result @@ -1698,6 +1760,7 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cvt_f32_u32_e32 v4, 0x12d8fb ; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v5, 0 ; GISEL-NEXT: v_mov_b32_e32 v6, 0xffed2705 +; GISEL-NEXT: ; implicit-def: $vgpr9_vgpr10 ; GISEL-NEXT: s_mov_b32 s6, 1 ; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 @@ -1734,42 +1797,43 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v7, v4 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v4, 0 +; GISEL-NEXT: ; implicit-def: $vgpr15_vgpr16 ; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc -; GISEL-NEXT: v_mov_b32_e32 v4, v14 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v4, v17, v13 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], s6, v16, v[14:15] +; GISEL-NEXT: v_mov_b32_e32 v15, v14 +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[15:16] +; GISEL-NEXT: v_mul_lo_u32 v9, v17, v13 +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], s6, v4, v[14:15] ; GISEL-NEXT: s_mov_b32 s6, 1 ; GISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GISEL-NEXT: v_mul_lo_u32 v9, v16, v14 +; GISEL-NEXT: v_mul_lo_u32 v15, v4, v14 ; GISEL-NEXT: s_subb_u32 s6, 0, 0 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v16, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_mul_hi_u32 v9, v17, v13 -; GISEL-NEXT: v_mul_lo_u32 v13, v17, v14 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; GISEL-NEXT: v_mul_hi_u32 v15, v16, v14 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v9, v15 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v15 +; GISEL-NEXT: v_mul_hi_u32 v15, v4, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v13, v17, v13 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v15, v17, v14 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v4, v14 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v9 ; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc ; GISEL-NEXT: v_xor_b32_e32 v18, v0, v9 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v17, v14 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v16 +; GISEL-NEXT: v_mul_hi_u32 v13, v17, v14 ; GISEL-NEXT: v_xor_b32_e32 v19, v1, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v15, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc ; GISEL-NEXT: v_mul_lo_u32 v13, v19, v0 ; GISEL-NEXT: v_mul_lo_u32 v14, v18, v1 @@ -1793,23 +1857,26 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2] +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v13 +; GISEL-NEXT: ; implicit-def: $vgpr13_vgpr14 +; GISEL-NEXT: v_mov_b32_e32 v13, v1 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v15, v[13:14] ; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v18, v0 ; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13 ; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], v19, v13, vcc ; GISEL-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc ; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v14, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v0, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v16, v4 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v16, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v13, -1, v1, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e32 v18, -1, v0, vcc +; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v13, -1, v1, s[4:5] ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[0:1] ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v7, v[0:1] ; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v16, v4 @@ -1835,95 +1902,99 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v1 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v7, v1 +; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v8, v0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; GISEL-NEXT: v_cndmask_b32_e32 v11, v15, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v8, v[1:2] -; GISEL-NEXT: v_xor_b32_e32 v1, v11, v9 -; GISEL-NEXT: v_ashrrev_i32_e32 v11, 31, v3 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v7, v[5:6] -; GISEL-NEXT: v_cndmask_b32_e32 v10, v14, v16, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v11 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v11, vcc -; GISEL-NEXT: v_xor_b32_e32 v12, v2, v11 -; GISEL-NEXT: v_mul_lo_u32 v2, v8, v0 -; GISEL-NEXT: v_mul_lo_u32 v6, v7, v5 -; GISEL-NEXT: v_xor_b32_e32 v13, v3, v11 -; GISEL-NEXT: v_mul_hi_u32 v3, v7, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v8, v0 +; GISEL-NEXT: v_cndmask_b32_e32 v7, v14, v16, vcc +; GISEL-NEXT: v_xor_b32_e32 v13, v7, v9 +; GISEL-NEXT: ; implicit-def: $vgpr7_vgpr8 +; GISEL-NEXT: v_cndmask_b32_e32 v12, v15, v5, vcc +; GISEL-NEXT: v_mov_b32_e32 v7, v1 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v11, v[7:8] +; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v3 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v7 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v10, v[5:6] +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v7, vcc +; GISEL-NEXT: v_xor_b32_e32 v8, v2, v7 +; GISEL-NEXT: v_mul_lo_u32 v2, v11, v0 +; GISEL-NEXT: v_mul_lo_u32 v6, v10, v5 +; GISEL-NEXT: v_xor_b32_e32 v1, v12, v9 +; GISEL-NEXT: v_xor_b32_e32 v12, v3, v7 +; GISEL-NEXT: v_mul_hi_u32 v3, v10, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v8, v5 +; GISEL-NEXT: v_mul_lo_u32 v3, v11, v5 +; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; GISEL-NEXT: v_mul_hi_u32 v6, v7, v5 +; GISEL-NEXT: v_mul_hi_u32 v6, v10, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GISEL-NEXT: v_mul_hi_u32 v5, v8, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v11, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, v13, v3 -; GISEL-NEXT: v_mul_lo_u32 v6, v12, v2 -; GISEL-NEXT: v_xor_b32_e32 v10, v10, v9 -; GISEL-NEXT: v_mul_hi_u32 v7, v12, v3 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v10, v9 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v10, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v5, v12, v3 +; GISEL-NEXT: v_mul_lo_u32 v6, v8, v2 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v13, v9 ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc +; GISEL-NEXT: v_mul_hi_u32 v9, v8, v3 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v7, v13, v2 -; GISEL-NEXT: v_mul_hi_u32 v3, v13, v3 +; GISEL-NEXT: v_mul_lo_u32 v9, v12, v2 +; GISEL-NEXT: v_mul_hi_u32 v3, v12, v3 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GISEL-NEXT: v_mul_hi_u32 v6, v12, v2 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v6, v8, v2 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v9, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GISEL-NEXT: v_mul_hi_u32 v7, v13, v2 +; GISEL-NEXT: v_mul_hi_u32 v9, v12, v2 ; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v3, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v5, v[3:4] -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 -; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v5, vcc -; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v5 +; GISEL-NEXT: ; implicit-def: $vgpr5_vgpr6 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v8, v2 +; GISEL-NEXT: v_mov_b32_e32 v5, v3 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v9, v[5:6] +; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v12, v5, vcc +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v12, v5 ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc -; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v2, v4 +; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v2, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v7, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v8, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v8, -1, v8, vcc -; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v7, v4 +; GISEL-NEXT: v_cndmask_b32_e32 v9, -1, v9, vcc +; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v8, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v3 -; GISEL-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v5, vcc +; GISEL-NEXT: v_subbrev_u32_e32 v10, vcc, 0, v5, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v6, -1, v6, s[4:5] -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; GISEL-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v11 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v11 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v11 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v11, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v7 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v7 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v7 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_srem_v2i64_oddk_denom: @@ -1932,6 +2003,7 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_cvt_f32_u32_e32 v4, 0x12d8fb ; CGP-NEXT: v_cvt_f32_ubyte0_e32 v5, 0 ; CGP-NEXT: v_mov_b32_e32 v6, 0xffed2705 +; CGP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 @@ -1965,39 +2037,40 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v13 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v4, 0 +; CGP-NEXT: ; implicit-def: $vgpr15_vgpr16 ; CGP-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc -; CGP-NEXT: v_mov_b32_e32 v4, v14 -; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5] -; CGP-NEXT: v_mul_lo_u32 v4, v17, v13 -; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], -1, v16, v[14:15] -; CGP-NEXT: v_mul_lo_u32 v9, v16, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v16, v13 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_hi_u32 v9, v17, v13 -; CGP-NEXT: v_mul_lo_u32 v13, v17, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_mul_hi_u32 v15, v16, v14 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v15, vcc, v9, v15 +; CGP-NEXT: v_mov_b32_e32 v15, v14 +; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[15:16] +; CGP-NEXT: v_mul_lo_u32 v9, v17, v13 +; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], -1, v4, v[14:15] +; CGP-NEXT: v_mul_lo_u32 v15, v4, v14 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v15 +; CGP-NEXT: v_mul_hi_u32 v15, v4, v13 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v13, v17, v13 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v15 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v15, v17, v14 +; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v9 +; CGP-NEXT: v_mul_hi_u32 v9, v4, v14 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13 +; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v9 ; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v1 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v9 ; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc ; CGP-NEXT: v_xor_b32_e32 v18, v0, v9 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v17, v14 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v13, v16 +; CGP-NEXT: v_mul_hi_u32 v13, v17, v14 ; CGP-NEXT: v_xor_b32_e32 v19, v1, v9 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v1, vcc, v15, v1 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v0 ; CGP-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc ; CGP-NEXT: v_mul_lo_u32 v13, v19, v0 ; CGP-NEXT: v_mul_lo_u32 v14, v18, v1 @@ -2021,24 +2094,27 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2] +; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v13 +; CGP-NEXT: ; implicit-def: $vgpr13_vgpr14 +; CGP-NEXT: v_mov_b32_e32 v13, v1 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v15, v[13:14] ; CGP-NEXT: v_sub_i32_e32 v14, vcc, v18, v0 -; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13 ; CGP-NEXT: v_subb_u32_e64 v15, s[4:5], v19, v13, vcc -; CGP-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc +; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v4 -; CGP-NEXT: v_sub_i32_e32 v16, vcc, v14, v4 +; CGP-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 +; CGP-NEXT: v_sub_i32_e32 v16, vcc, v14, v4 +; CGP-NEXT: v_cndmask_b32_e64 v13, -1, v1, s[4:5] ; CGP-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v0, vcc +; CGP-NEXT: ; implicit-def: $vgpr0_vgpr1 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v16, v4 ; CGP-NEXT: v_mov_b32_e32 v0, v5 -; CGP-NEXT: v_cndmask_b32_e64 v13, -1, v1, s[4:5] ; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[0:1] -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v16, v4 ; CGP-NEXT: v_cndmask_b32_e64 v18, 0, -1, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[0:1] ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[0:1] ; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v18, vcc ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v16, v4 ; CGP-NEXT: v_subbrev_u32_e32 v18, vcc, 0, v17, vcc @@ -2063,95 +2139,99 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v1 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v7, v1 +; CGP-NEXT: v_addc_u32_e32 v11, vcc, v8, v0, vcc +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0 ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; CGP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; CGP-NEXT: v_cndmask_b32_e32 v5, v14, v16, vcc -; CGP-NEXT: v_xor_b32_e32 v11, v5, v9 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v8, v[1:2] -; CGP-NEXT: v_cndmask_b32_e32 v10, v15, v17, vcc -; CGP-NEXT: v_xor_b32_e32 v1, v10, v9 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v7, v[5:6] -; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc -; CGP-NEXT: v_xor_b32_e32 v12, v2, v10 -; CGP-NEXT: v_mul_lo_u32 v2, v8, v0 -; CGP-NEXT: v_mul_lo_u32 v6, v7, v5 -; CGP-NEXT: v_xor_b32_e32 v13, v3, v10 -; CGP-NEXT: v_mul_hi_u32 v3, v7, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 +; CGP-NEXT: v_mov_b32_e32 v7, v1 +; CGP-NEXT: v_xor_b32_e32 v13, v5, v9 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v11, v[7:8] +; CGP-NEXT: v_ashrrev_i32_e32 v7, 31, v3 +; CGP-NEXT: v_cndmask_b32_e32 v12, v15, v17, vcc +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v10, v[5:6] +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v7 +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v7, vcc +; CGP-NEXT: v_xor_b32_e32 v8, v2, v7 +; CGP-NEXT: v_mul_lo_u32 v2, v11, v0 +; CGP-NEXT: v_mul_lo_u32 v6, v10, v5 +; CGP-NEXT: v_xor_b32_e32 v1, v12, v9 +; CGP-NEXT: v_xor_b32_e32 v12, v3, v7 +; CGP-NEXT: v_mul_hi_u32 v3, v10, v0 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v3, v8, v5 +; CGP-NEXT: v_mul_lo_u32 v3, v11, v5 +; CGP-NEXT: v_mul_hi_u32 v0, v11, v0 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; CGP-NEXT: v_mul_hi_u32 v6, v7, v5 +; CGP-NEXT: v_mul_hi_u32 v6, v10, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; CGP-NEXT: v_mul_hi_u32 v5, v8, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v11, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v0 -; CGP-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc -; CGP-NEXT: v_mul_lo_u32 v5, v13, v3 -; CGP-NEXT: v_mul_lo_u32 v6, v12, v2 -; CGP-NEXT: v_mul_hi_u32 v7, v12, v3 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v9 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v0 +; CGP-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc +; CGP-NEXT: v_mul_lo_u32 v5, v12, v3 +; CGP-NEXT: v_mul_lo_u32 v6, v8, v2 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v13, v9 ; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc +; CGP-NEXT: v_mul_hi_u32 v9, v8, v3 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v7, v13, v2 -; CGP-NEXT: v_mul_hi_u32 v3, v13, v3 +; CGP-NEXT: v_mul_lo_u32 v9, v12, v2 +; CGP-NEXT: v_mul_hi_u32 v3, v12, v3 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CGP-NEXT: v_mul_hi_u32 v6, v12, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v6, v8, v2 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v3 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; CGP-NEXT: v_mul_hi_u32 v7, v13, v2 +; CGP-NEXT: v_mul_hi_u32 v9, v12, v2 ; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v3, 0 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v5, v[3:4] -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 -; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v5, vcc -; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v5 +; CGP-NEXT: ; implicit-def: $vgpr5_vgpr6 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v8, v2 +; CGP-NEXT: v_mov_b32_e32 v5, v3 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v9, v[5:6] +; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v12, v5, vcc +; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v12, v5 ; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc -; CGP-NEXT: v_sub_i32_e32 v7, vcc, v2, v4 +; CGP-NEXT: v_sub_i32_e32 v8, vcc, v2, v4 ; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v7, v4 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v8, v4 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v4 -; CGP-NEXT: v_cndmask_b32_e32 v8, -1, v8, vcc -; CGP-NEXT: v_sub_i32_e32 v4, vcc, v7, v4 +; CGP-NEXT: v_cndmask_b32_e32 v9, -1, v9, vcc +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v8, v4 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v3 -; CGP-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v5, vcc +; CGP-NEXT: v_subbrev_u32_e32 v10, vcc, 0, v5, vcc ; CGP-NEXT: v_cndmask_b32_e64 v6, -1, v6, s[4:5] -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; CGP-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc -; CGP-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; CGP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; CGP-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; CGP-NEXT: v_xor_b32_e32 v2, v2, v10 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v10 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc +; CGP-NEXT: v_xor_b32_e32 v2, v2, v7 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v7 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v7 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc ; CGP-NEXT: s_setpc_b64 s[30:31] %result = srem <2 x i64> %num, ret <2 x i64> %result @@ -2195,72 +2275,74 @@ define i64 @v_srem_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: v_mul_f32_e32 v5, 0x2f800000, v2 ; CHECK-NEXT: v_trunc_f32_e32 v7, v5 ; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v7 -; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 ; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v7 -; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v8, 0 -; CHECK-NEXT: v_mov_b32_e32 v2, v6 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[2:3] -; CHECK-NEXT: v_mul_lo_u32 v2, v11, v5 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7] -; CHECK-NEXT: v_mul_hi_u32 v7, v8, v5 +; CHECK-NEXT: ; implicit-def: $vgpr7_vgpr8 +; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v2, 0 +; CHECK-NEXT: v_mov_b32_e32 v7, v6 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[7:8] +; CHECK-NEXT: v_mul_hi_u32 v8, v2, v5 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v2, v[6:7] +; CHECK-NEXT: v_mul_lo_u32 v7, v11, v5 ; CHECK-NEXT: v_mul_hi_u32 v5, v11, v5 -; CHECK-NEXT: v_mul_lo_u32 v12, v8, v6 +; CHECK-NEXT: v_mul_lo_u32 v12, v2, v6 ; CHECK-NEXT: v_mul_lo_u32 v13, v11, v6 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v12 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v12 ; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 -; CHECK-NEXT: v_mul_hi_u32 v7, v8, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v12, v2 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; CHECK-NEXT: v_mul_hi_u32 v8, v2, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v12, v7 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v13, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v12, v8 +; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v12, v7 -; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2 -; CHECK-NEXT: v_addc_u32_e32 v11, vcc, v11, v5, vcc -; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v8, 0 -; CHECK-NEXT: v_mov_b32_e32 v2, v6 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[2:3] -; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v4 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v9 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7] -; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v4, v9, vcc -; CHECK-NEXT: v_xor_b32_e32 v7, v2, v9 -; CHECK-NEXT: v_mul_lo_u32 v2, v11, v5 -; CHECK-NEXT: v_mul_lo_u32 v4, v8, v6 -; CHECK-NEXT: v_xor_b32_e32 v10, v3, v9 -; CHECK-NEXT: v_mul_hi_u32 v3, v8, v5 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; CHECK-NEXT: v_addc_u32_e32 v11, vcc, v11, v6, vcc +; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v2, 0 +; CHECK-NEXT: ; implicit-def: $vgpr7_vgpr8 +; CHECK-NEXT: v_mov_b32_e32 v7, v6 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[7:8] +; CHECK-NEXT: v_ashrrev_i32_e32 v8, 31, v4 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v8 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v2, v[6:7] +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v8, vcc +; CHECK-NEXT: v_xor_b32_e32 v7, v3, v8 +; CHECK-NEXT: v_mul_lo_u32 v3, v11, v5 +; CHECK-NEXT: v_mul_lo_u32 v9, v2, v6 +; CHECK-NEXT: v_xor_b32_e32 v10, v4, v8 +; CHECK-NEXT: v_mul_hi_u32 v4, v2, v5 ; CHECK-NEXT: v_mul_hi_u32 v5, v11, v5 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v3, v11, v6 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v4, v2 -; CHECK-NEXT: v_mul_hi_u32 v4, v8, v6 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v9 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CHECK-NEXT: v_mul_hi_u32 v5, v11, v6 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_mul_lo_u32 v4, v11, v6 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v9, v3 +; CHECK-NEXT: v_mul_hi_u32 v9, v2, v6 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v8, v2 -; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v11, v3, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v11, v4, vcc ; CHECK-NEXT: v_mul_lo_u32 v4, v10, v2 ; CHECK-NEXT: v_mul_lo_u32 v5, v7, v3 ; CHECK-NEXT: v_mul_hi_u32 v6, v7, v2 ; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 -; CHECK-NEXT: v_mul_hi_u32 v8, v10, v3 +; CHECK-NEXT: v_mul_hi_u32 v9, v10, v3 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6 @@ -2277,9 +2359,11 @@ define i64 @v_srem_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v6, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v0, v4, v[3:4] +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v4 +; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 ; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v7, v2 +; CHECK-NEXT: v_mov_b32_e32 v4, v3 +; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v0, v9, v[4:5] ; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v6, v[3:4] ; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v10, v3, vcc ; CHECK-NEXT: v_sub_i32_e64 v3, s[4:5], v10, v3 @@ -2293,24 +2377,24 @@ define i64 @v_srem_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v2, v0 ; CHECK-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v3, vcc ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v1 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v1 ; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v6, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[4:5] +; CHECK-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[4:5] ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v9 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v9 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v9 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v8 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v8 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc ; CHECK-NEXT: ; implicit-def: $vgpr5_vgpr6 ; CHECK-NEXT: ; implicit-def: $vgpr3 ; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] @@ -2363,255 +2447,262 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_mul_f32_e32 v10, 0x2f800000, v4 ; GISEL-NEXT: v_trunc_f32_e32 v12, v10 ; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v12 -; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v12 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0 -; GISEL-NEXT: v_mov_b32_e32 v4, v11 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v4, v16, v10 -; GISEL-NEXT: v_mul_hi_u32 v17, v13, v10 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12] +; GISEL-NEXT: ; implicit-def: $vgpr12_vgpr13 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v4, 0 +; GISEL-NEXT: v_mov_b32_e32 v12, v11 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[12:13] +; GISEL-NEXT: v_mul_hi_u32 v17, v4, v10 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v4, v[11:12] +; GISEL-NEXT: v_mul_lo_u32 v12, v16, v10 ; GISEL-NEXT: v_mul_hi_u32 v10, v16, v10 -; GISEL-NEXT: v_mul_lo_u32 v12, v13, v11 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 +; GISEL-NEXT: v_mul_lo_u32 v13, v4, v11 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GISEL-NEXT: v_mul_lo_u32 v17, v16, v11 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v12, v4 -; GISEL-NEXT: v_mul_hi_u32 v12, v13, v11 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_mul_hi_u32 v13, v4, v11 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v17, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v17, v13 +; GISEL-NEXT: v_mul_hi_u32 v11, v16, v11 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v17, v12 -; GISEL-NEXT: v_mul_hi_u32 v11, v16, v11 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v10, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v4 -; GISEL-NEXT: v_addc_u32_e32 v16, vcc, v16, v10, vcc -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0 -; GISEL-NEXT: v_mov_b32_e32 v4, v11 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[4:5] +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v4, v10 +; GISEL-NEXT: v_addc_u32_e32 v16, vcc, v16, v11, vcc +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v17, 0 +; GISEL-NEXT: ; implicit-def: $vgpr12_vgpr13 ; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12] +; GISEL-NEXT: v_mov_b32_e32 v12, v11 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[12:13] ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc -; GISEL-NEXT: v_xor_b32_e32 v12, v0, v4 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v17, v[11:12] +; GISEL-NEXT: v_xor_b32_e32 v13, v0, v4 ; GISEL-NEXT: v_mul_lo_u32 v0, v16, v10 -; GISEL-NEXT: v_mul_lo_u32 v14, v13, v11 -; GISEL-NEXT: v_xor_b32_e32 v15, v1, v4 -; GISEL-NEXT: v_mul_hi_u32 v1, v13, v10 +; GISEL-NEXT: v_mul_lo_u32 v12, v17, v11 +; GISEL-NEXT: v_xor_b32_e32 v14, v1, v4 +; GISEL-NEXT: v_mul_hi_u32 v1, v17, v10 ; GISEL-NEXT: v_mul_hi_u32 v10, v16, v10 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_mul_lo_u32 v1, v16, v11 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 -; GISEL-NEXT: v_mul_hi_u32 v14, v13, v11 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 +; GISEL-NEXT: v_mul_hi_u32 v12, v17, v11 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; GISEL-NEXT: v_mul_hi_u32 v11, v16, v11 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v0 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v17, v0 ; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v16, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v15, v10 -; GISEL-NEXT: v_mul_lo_u32 v14, v12, v11 +; GISEL-NEXT: v_mul_lo_u32 v12, v14, v10 +; GISEL-NEXT: v_mul_lo_u32 v15, v13, v11 ; GISEL-NEXT: v_lshl_b64 v[0:1], v[8:9], v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v12, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v15, v10 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v14 +; GISEL-NEXT: v_mul_hi_u32 v6, v13, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v14, v10 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v12, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v8, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v8, v15, v11 +; GISEL-NEXT: v_mul_lo_u32 v8, v14, v11 ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6 -; GISEL-NEXT: v_mul_hi_u32 v9, v12, v11 +; GISEL-NEXT: v_mul_hi_u32 v9, v13, v11 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v8, v6 -; GISEL-NEXT: v_mul_hi_u32 v8, v15, v11 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v8, v6 +; GISEL-NEXT: v_mul_hi_u32 v8, v14, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v8, v6 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v8, v6 ; GISEL-NEXT: v_ashrrev_i32_e32 v8, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v8, vcc -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v13, 0 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v15, 0 ; GISEL-NEXT: v_xor_b32_e32 v6, v0, v8 ; GISEL-NEXT: v_xor_b32_e32 v8, v1, v8 -; GISEL-NEXT: v_cvt_f32_u32_e32 v14, v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v16, v8 -; GISEL-NEXT: v_mov_b32_e32 v0, v10 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v11, v[0:1] -; GISEL-NEXT: v_mac_f32_e32 v14, 0x4f800000, v16 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v10, v14 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v13, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 0, v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v17, v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v18, v8 +; GISEL-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GISEL-NEXT: v_mov_b32_e32 v11, v10 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v16, v[11:12] +; GISEL-NEXT: v_mac_f32_e32 v17, 0x4f800000, v18 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v10, v17 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v15, v[0:1] +; GISEL-NEXT: v_sub_i32_e32 v15, vcc, 0, v6 ; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v10 ; GISEL-NEXT: v_mul_f32_e32 v10, 0x2f800000, v1 -; GISEL-NEXT: v_trunc_f32_e32 v13, v10 -; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v13 -; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v1 -; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v13 -; GISEL-NEXT: v_subb_u32_e32 v17, vcc, 0, v8, vcc -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v14, 0 -; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v12, v9 -; GISEL-NEXT: v_mov_b32_e32 v1, v11 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v13, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v1, v13, v10 -; GISEL-NEXT: v_subb_u32_e64 v18, s[4:5], v15, v0, vcc -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v14, v[11:12] -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v15, v0 -; GISEL-NEXT: v_mul_lo_u32 v12, v14, v11 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v18, v7 +; GISEL-NEXT: v_trunc_f32_e32 v12, v10 +; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v12 +; GISEL-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v12 +; GISEL-NEXT: v_subb_u32_e32 v16, vcc, 0, v8, vcc +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v15, v1, 0 +; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v13, v9 +; GISEL-NEXT: ; implicit-def: $vgpr12_vgpr13 +; GISEL-NEXT: v_mov_b32_e32 v12, v11 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v17, v[12:13] +; GISEL-NEXT: v_subb_u32_e64 v13, s[4:5], v14, v0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v1, v[11:12] +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v14, v0 +; GISEL-NEXT: v_mul_lo_u32 v12, v17, v10 +; GISEL-NEXT: v_mul_lo_u32 v14, v1, v11 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v13, v7 ; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v0, v7, vcc -; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v14, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v9, v5 +; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14 +; GISEL-NEXT: v_mul_hi_u32 v14, v1, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] +; GISEL-NEXT: v_mul_hi_u32 v10, v17, v10 +; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v18, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v12, v1, v12, s[6:7] -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v9, v5 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v9, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[6:7] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v13, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v12, v12, v14, s[6:7] +; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v9, v5 ; GISEL-NEXT: v_subbrev_u32_e64 v19, s[6:7], 0, v0, vcc -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v1, v5 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v14, v5 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v19, v7 ; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v0, v7, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, -1, s[8:9] ; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[6:7] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v19, v7 -; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v1, v5 +; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v14, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[6:7] ; GISEL-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v1, v5, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v5, v14, v5, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v7, v19, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_hi_u32 v1, v13, v10 -; GISEL-NEXT: v_mul_lo_u32 v10, v13, v11 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 -; GISEL-NEXT: v_mul_hi_u32 v15, v14, v11 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 +; GISEL-NEXT: v_mul_lo_u32 v14, v17, v11 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v18, v0 +; GISEL-NEXT: v_mul_hi_u32 v18, v1, v11 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v18 +; GISEL-NEXT: v_mul_hi_u32 v11, v17, v11 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v15 -; GISEL-NEXT: v_mul_hi_u32 v11, v13, v11 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v14, v0 -; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v13, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v11, 0 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v1, v0 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v15, v11, 0 +; GISEL-NEXT: v_addc_u32_e32 v14, vcc, v17, v10, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 ; GISEL-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v13, v[1:2] -; GISEL-NEXT: v_xor_b32_e32 v1, v5, v4 -; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v3 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v17, v11, v[9:10] -; GISEL-NEXT: v_cndmask_b32_e32 v7, v18, v7, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc -; GISEL-NEXT: v_xor_b32_e32 v12, v2, v5 -; GISEL-NEXT: v_mul_lo_u32 v2, v13, v0 -; GISEL-NEXT: v_mul_lo_u32 v10, v11, v9 -; GISEL-NEXT: v_xor_b32_e32 v14, v3, v5 -; GISEL-NEXT: v_mul_hi_u32 v3, v11, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GISEL-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GISEL-NEXT: v_ashrrev_i32_e32 v12, 31, v3 +; GISEL-NEXT: v_mov_b32_e32 v9, v1 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v15, v14, v[9:10] +; GISEL-NEXT: v_cndmask_b32_e32 v7, v13, v7, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v12 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v11, v[9:10] +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v3, v12, vcc +; GISEL-NEXT: v_xor_b32_e32 v13, v1, v12 +; GISEL-NEXT: v_mul_lo_u32 v1, v14, v0 +; GISEL-NEXT: v_mul_lo_u32 v3, v11, v9 +; GISEL-NEXT: v_xor_b32_e32 v15, v2, v12 +; GISEL-NEXT: v_mul_hi_u32 v2, v11, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v2, v14, v9 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; GISEL-NEXT: v_mul_hi_u32 v3, v11, v9 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v2, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v13, v9 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v10, v2 -; GISEL-NEXT: v_mul_hi_u32 v10, v11, v9 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v10 -; GISEL-NEXT: v_mul_hi_u32 v9, v13, v9 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GISEL-NEXT: v_mul_hi_u32 v3, v14, v9 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v14, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v2, v15, v0 +; GISEL-NEXT: v_mul_lo_u32 v3, v13, v1 +; GISEL-NEXT: v_mul_hi_u32 v9, v13, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 +; GISEL-NEXT: v_xor_b32_e32 v5, v5, v4 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v9, v15, v1 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v13, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v14, v0 -; GISEL-NEXT: v_mul_lo_u32 v9, v12, v2 -; GISEL-NEXT: v_mul_hi_u32 v10, v12, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 -; GISEL-NEXT: v_xor_b32_e32 v7, v7, v4 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 +; GISEL-NEXT: v_mul_hi_u32 v3, v13, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v10 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v10, v14, v2 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v9, v3 -; GISEL-NEXT: v_mul_hi_u32 v9, v12, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v3 -; GISEL-NEXT: v_mul_hi_u32 v10, v14, v2 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v3 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v2 +; GISEL-NEXT: v_mul_hi_u32 v1, v15, v1 ; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v11, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v0 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v1, v0 +; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GISEL-NEXT: v_xor_b32_e32 v7, v7, v4 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 ; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v9, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v5, v4 ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v7, v4, vcc ; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v11, v[9:10] -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v14, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v14, v3 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v13, v2 +; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v15, v3, vcc +; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v15, v3 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v8 ; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v8, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[4:5] -; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v2, v6 -; GISEL-NEXT: v_subbrev_u32_e64 v10, s[4:5], 0, v3, vcc -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v7, s[4:5] +; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v2, v6 +; GISEL-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v3, vcc +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v6 ; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v8, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v8 -; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v9, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v8 +; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v7, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5] ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GISEL-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GISEL-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v5 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v5 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v5, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v12 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v12 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v12 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_srem_v2i64_pow2_shl_denom: @@ -2647,72 +2738,74 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_mul_f32_e32 v10, 0x2f800000, v4 ; CGP-NEXT: v_trunc_f32_e32 v12, v10 ; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v12 -; CGP-NEXT: v_cvt_u32_f32_e32 v13, v4 +; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 ; CGP-NEXT: v_cvt_u32_f32_e32 v16, v12 -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0 -; CGP-NEXT: v_mov_b32_e32 v4, v11 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[4:5] -; CGP-NEXT: v_mul_lo_u32 v4, v16, v10 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12] -; CGP-NEXT: v_mul_hi_u32 v12, v13, v10 +; CGP-NEXT: ; implicit-def: $vgpr12_vgpr13 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v4, 0 +; CGP-NEXT: v_mov_b32_e32 v12, v11 +; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[12:13] +; CGP-NEXT: v_mul_hi_u32 v13, v4, v10 +; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v4, v[11:12] +; CGP-NEXT: v_mul_lo_u32 v12, v16, v10 ; CGP-NEXT: v_mul_hi_u32 v10, v16, v10 -; CGP-NEXT: v_mul_lo_u32 v17, v13, v11 +; CGP-NEXT: v_mul_lo_u32 v17, v4, v11 ; CGP-NEXT: v_mul_lo_u32 v18, v16, v11 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v17 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v17 ; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v12 -; CGP-NEXT: v_mul_hi_u32 v12, v13, v11 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v17, v4 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; CGP-NEXT: v_mul_hi_u32 v13, v4, v11 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v17, v12 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v18, v10 ; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v13 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v17, v13 +; CGP-NEXT: v_mul_hi_u32 v11, v16, v11 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v17, v12 -; CGP-NEXT: v_mul_hi_u32 v11, v16, v11 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v4 -; CGP-NEXT: v_addc_u32_e32 v16, vcc, v16, v10, vcc -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0 -; CGP-NEXT: v_mov_b32_e32 v4, v11 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[4:5] -; CGP-NEXT: v_ashrrev_i32_e32 v14, 31, v9 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v14 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12] -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v14, vcc -; CGP-NEXT: v_xor_b32_e32 v12, v4, v14 -; CGP-NEXT: v_mul_lo_u32 v4, v16, v10 -; CGP-NEXT: v_mul_lo_u32 v9, v13, v11 -; CGP-NEXT: v_xor_b32_e32 v15, v8, v14 -; CGP-NEXT: v_mul_hi_u32 v8, v13, v10 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 +; CGP-NEXT: v_addc_u32_e32 v16, vcc, v16, v11, vcc +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v4, 0 +; CGP-NEXT: ; implicit-def: $vgpr12_vgpr13 +; CGP-NEXT: v_mov_b32_e32 v12, v11 +; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[12:13] +; CGP-NEXT: v_ashrrev_i32_e32 v13, 31, v9 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v13 +; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v4, v[11:12] +; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v13, vcc +; CGP-NEXT: v_xor_b32_e32 v12, v8, v13 +; CGP-NEXT: v_mul_lo_u32 v8, v16, v10 +; CGP-NEXT: v_mul_lo_u32 v14, v4, v11 +; CGP-NEXT: v_xor_b32_e32 v15, v9, v13 +; CGP-NEXT: v_mul_hi_u32 v9, v4, v10 ; CGP-NEXT: v_mul_hi_u32 v10, v16, v10 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v8, v16, v11 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 -; CGP-NEXT: v_mul_hi_u32 v9, v13, v11 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v14 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; CGP-NEXT: v_mul_hi_u32 v10, v16, v11 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v9, v16, v11 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v14, v8 +; CGP-NEXT: v_mul_hi_u32 v14, v4, v11 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v14 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v14 +; CGP-NEXT: v_mul_hi_u32 v11, v16, v11 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v16, v8, vcc +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v16, v9, vcc ; CGP-NEXT: v_mul_lo_u32 v9, v15, v4 ; CGP-NEXT: v_mul_lo_u32 v10, v12, v8 ; CGP-NEXT: v_mul_hi_u32 v11, v12, v4 ; CGP-NEXT: v_mul_hi_u32 v4, v15, v4 -; CGP-NEXT: v_mul_hi_u32 v13, v15, v8 +; CGP-NEXT: v_mul_hi_u32 v14, v15, v8 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 @@ -2725,15 +2818,16 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v4, v9 -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v11, 0 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v13, v4 -; CGP-NEXT: v_mov_b32_e32 v4, v9 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v0, v10, v[4:5] +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v4, 0 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v10 +; CGP-NEXT: ; implicit-def: $vgpr10_vgpr11 +; CGP-NEXT: v_mov_b32_e32 v10, v9 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v0, v14, v[10:11] +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v1, v4, v[9:10] ; CGP-NEXT: v_sub_i32_e32 v4, vcc, v12, v8 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v1, v11, v[9:10] ; CGP-NEXT: v_subb_u32_e64 v8, s[4:5], v15, v9, vcc ; CGP-NEXT: v_sub_i32_e64 v9, s[4:5], v15, v9 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v1 @@ -2746,24 +2840,24 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_sub_i32_e32 v11, vcc, v4, v0 ; CGP-NEXT: v_subbrev_u32_e64 v12, s[4:5], 0, v9, vcc ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v12, v1 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v0 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v12, v1 ; CGP-NEXT: v_subb_u32_e32 v1, vcc, v9, v1, vcc ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v0 -; CGP-NEXT: v_cndmask_b32_e64 v13, v13, v15, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[4:5] ; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 ; CGP-NEXT: v_cndmask_b32_e32 v0, v11, v0, vcc ; CGP-NEXT: v_cndmask_b32_e32 v1, v12, v1, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; CGP-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; CGP-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; CGP-NEXT: v_xor_b32_e32 v0, v0, v14 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v14 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v14 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v14, vcc +; CGP-NEXT: v_xor_b32_e32 v0, v0, v13 +; CGP-NEXT: v_xor_b32_e32 v1, v1, v13 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v13 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v13, vcc ; CGP-NEXT: ; implicit-def: $vgpr11_vgpr12 ; CGP-NEXT: ; implicit-def: $vgpr8 ; CGP-NEXT: .LBB8_2: ; %Flow1 @@ -2814,6 +2908,7 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3 ; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v2 +; CGP-NEXT: ; implicit-def: $vgpr10_vgpr11 ; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v3, vcc ; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v6 ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 @@ -2821,92 +2916,95 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_mul_f32_e32 v6, 0x2f800000, v4 ; CGP-NEXT: v_trunc_f32_e32 v6, v6 ; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v6 -; CGP-NEXT: v_cvt_u32_f32_e32 v11, v4 +; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 ; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6 -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v11, 0 -; CGP-NEXT: v_mov_b32_e32 v4, v9 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v6, v[4:5] -; CGP-NEXT: v_mul_lo_u32 v4, v6, v8 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[9:10] -; CGP-NEXT: v_mul_hi_u32 v10, v11, v8 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v4, 0 +; CGP-NEXT: v_mov_b32_e32 v10, v9 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v6, v[10:11] +; CGP-NEXT: v_mul_hi_u32 v11, v4, v8 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v4, v[9:10] +; CGP-NEXT: v_mul_lo_u32 v10, v6, v8 ; CGP-NEXT: v_mul_hi_u32 v8, v6, v8 -; CGP-NEXT: v_mul_lo_u32 v14, v11, v9 +; CGP-NEXT: v_mul_lo_u32 v14, v4, v9 ; CGP-NEXT: v_mul_lo_u32 v15, v6, v9 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v14 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v11, v9 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v14, v4 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; CGP-NEXT: v_mul_hi_u32 v11, v4, v9 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v14, v10 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v15, v8 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v14, v11 +; CGP-NEXT: v_mul_hi_u32 v9, v6, v9 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v14, v10 -; CGP-NEXT: v_mul_hi_u32 v9, v6, v9 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v4 -; CGP-NEXT: v_addc_u32_e32 v6, vcc, v6, v8, vcc -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v11, 0 -; CGP-NEXT: v_mov_b32_e32 v4, v9 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v6, v[4:5] -; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v7 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v12 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[9:10] -; CGP-NEXT: v_addc_u32_e32 v5, vcc, v7, v12, vcc -; CGP-NEXT: v_xor_b32_e32 v7, v4, v12 -; CGP-NEXT: v_mul_lo_u32 v4, v6, v8 -; CGP-NEXT: v_mul_lo_u32 v10, v11, v9 -; CGP-NEXT: v_xor_b32_e32 v13, v5, v12 -; CGP-NEXT: v_mul_hi_u32 v5, v11, v8 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; CGP-NEXT: v_addc_u32_e32 v6, vcc, v6, v9, vcc +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v4, 0 +; CGP-NEXT: ; implicit-def: $vgpr10_vgpr11 +; CGP-NEXT: v_mov_b32_e32 v10, v9 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v6, v[10:11] +; CGP-NEXT: v_ashrrev_i32_e32 v11, 31, v7 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v11 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v4, v[9:10] +; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v11, vcc +; CGP-NEXT: v_xor_b32_e32 v10, v5, v11 +; CGP-NEXT: v_mul_lo_u32 v5, v6, v8 +; CGP-NEXT: v_mul_lo_u32 v12, v4, v9 +; CGP-NEXT: v_xor_b32_e32 v13, v7, v11 +; CGP-NEXT: v_mul_hi_u32 v7, v4, v8 ; CGP-NEXT: v_mul_hi_u32 v8, v6, v8 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v5, v6, v9 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 -; CGP-NEXT: v_mul_hi_u32 v10, v11, v9 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v7, v6, v9 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5 +; CGP-NEXT: v_mul_hi_u32 v12, v4, v9 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v12 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v12 ; CGP-NEXT: v_mul_hi_u32 v9, v6, v9 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 -; CGP-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CGP-NEXT: v_addc_u32_e32 v5, vcc, v6, v7, vcc ; CGP-NEXT: v_mul_lo_u32 v6, v13, v4 -; CGP-NEXT: v_mul_lo_u32 v8, v7, v5 -; CGP-NEXT: v_mul_hi_u32 v9, v7, v4 +; CGP-NEXT: v_mul_lo_u32 v7, v10, v5 +; CGP-NEXT: v_mul_hi_u32 v8, v10, v4 ; CGP-NEXT: v_mul_hi_u32 v4, v13, v4 -; CGP-NEXT: v_mul_hi_u32 v10, v13, v5 +; CGP-NEXT: v_mul_hi_u32 v9, v13, v5 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v9 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v13, v5 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CGP-NEXT: v_mul_hi_u32 v8, v7, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; CGP-NEXT: v_mul_lo_u32 v8, v13, v5 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CGP-NEXT: v_mul_hi_u32 v7, v10, v5 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v4, v6 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v9, 0 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v4, v6 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v8, 0 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v10, v6 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v6, v[5:6] -; CGP-NEXT: v_sub_i32_e32 v4, vcc, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v9, v[5:6] +; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v6 +; CGP-NEXT: ; implicit-def: $vgpr6_vgpr7 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v10, v4 +; CGP-NEXT: v_mov_b32_e32 v6, v5 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v9, v[6:7] +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v8, v[5:6] ; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v13, v5, vcc ; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3 @@ -2921,11 +3019,11 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v2 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v3 ; CGP-NEXT: v_subb_u32_e32 v3, vcc, v5, v3, vcc ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v8, v2 -; CGP-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v10, v10, v12, s[4:5] ; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; CGP-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc @@ -2933,10 +3031,10 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 ; CGP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; CGP-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc -; CGP-NEXT: v_xor_b32_e32 v2, v2, v12 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v12 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v12 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc +; CGP-NEXT: v_xor_b32_e32 v2, v2, v11 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v11 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v11 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v11, vcc ; CGP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; CGP-NEXT: ; implicit-def: $vgpr5 ; CGP-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] @@ -3033,199 +3131,205 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_sub_i32_e32 v11, vcc, 0, v1 ; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v4 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; GISEL-NEXT: ; implicit-def: $vgpr9_vgpr10 ; GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], 0, 0, vcc -; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; GISEL-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 ; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v3 ; GISEL-NEXT: v_trunc_f32_e32 v5, v5 ; GISEL-NEXT: v_mac_f32_e32 v3, 0xcf800000, v5 -; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v3 +; GISEL-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v10, 0 -; GISEL-NEXT: v_mov_b32_e32 v3, v8 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v11, v5, v[3:4] -; GISEL-NEXT: v_mul_lo_u32 v3, v5, v7 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[8:9] -; GISEL-NEXT: v_mul_hi_u32 v9, v10, v7 +; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v3, 0 +; GISEL-NEXT: v_mov_b32_e32 v9, v8 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v11, v5, v[9:10] +; GISEL-NEXT: v_mul_hi_u32 v10, v3, v7 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v3, v[8:9] +; GISEL-NEXT: v_mul_lo_u32 v9, v5, v7 ; GISEL-NEXT: v_mul_hi_u32 v7, v5, v7 -; GISEL-NEXT: v_mul_lo_u32 v13, v10, v8 +; GISEL-NEXT: v_mul_lo_u32 v13, v3, v8 ; GISEL-NEXT: v_mul_lo_u32 v14, v5, v8 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v10, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v13, v3 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v3, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v10 ; GISEL-NEXT: v_mul_hi_u32 v8, v5, v8 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v3 -; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v10, 0 -; GISEL-NEXT: v_mov_b32_e32 v3, v8 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v11, v5, v[3:4] -; GISEL-NEXT: v_mul_lo_u32 v3, v5, v7 -; GISEL-NEXT: v_and_b32_e32 v11, 0xffffff, v0 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[8:9] -; GISEL-NEXT: v_mul_hi_u32 v0, v10, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v5, v7 -; GISEL-NEXT: v_mul_lo_u32 v9, v10, v8 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v5, v8 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 -; GISEL-NEXT: v_mul_hi_u32 v9, v10, v8 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 +; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v8, vcc +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v3, 0 +; GISEL-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GISEL-NEXT: v_mov_b32_e32 v9, v8 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v11, v5, v[9:10] +; GISEL-NEXT: v_mul_hi_u32 v11, v3, v7 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v3, v[8:9] +; GISEL-NEXT: v_mul_lo_u32 v9, v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v5, v7 +; GISEL-NEXT: v_mul_lo_u32 v10, v3, v8 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; GISEL-NEXT: v_mul_lo_u32 v11, v5, v8 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; GISEL-NEXT: v_mul_hi_u32 v10, v3, v8 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v11, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GISEL-NEXT: v_mul_hi_u32 v8, v5, v8 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v8, v3 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 -; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v3, vcc -; GISEL-NEXT: v_mul_lo_u32 v7, 0, v0 -; GISEL-NEXT: v_mul_lo_u32 v8, v11, v5 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v3, v7 +; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v8, vcc +; GISEL-NEXT: v_mul_lo_u32 v8, 0, v7 +; GISEL-NEXT: v_mul_lo_u32 v9, v0, v5 ; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v11, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, 0, v0 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 -; GISEL-NEXT: v_mul_lo_u32 v8, 0, v5 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; GISEL-NEXT: v_mul_hi_u32 v7, v11, v5 +; GISEL-NEXT: v_mul_hi_u32 v6, v0, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, 0, v7 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; GISEL-NEXT: v_mul_lo_u32 v9, 0, v5 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; GISEL-NEXT: v_mul_hi_u32 v8, v0, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v7, v6 +; GISEL-NEXT: v_mul_hi_u32 v10, 0, v5 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v9, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v0, v6 -; GISEL-NEXT: v_mul_hi_u32 v9, 0, v5 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v8, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v0 -; GISEL-NEXT: v_mov_b32_e32 v0, v6 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v7, v[0:1] -; GISEL-NEXT: v_cvt_f32_u32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], 0, v8, v[6:7] -; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v4 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v11, v5 -; GISEL-NEXT: v_subb_u32_e64 v8, s[4:5], 0, v6, vcc -; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v7 +; GISEL-NEXT: ; implicit-def: $vgpr7_vgpr8 +; GISEL-NEXT: v_mov_b32_e32 v7, v6 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v10, v[7:8] +; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v3 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], 0, v9, v[6:7] +; GISEL-NEXT: v_mac_f32_e32 v8, 0x4f800000, v4 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v8 +; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v0, v5 +; GISEL-NEXT: v_subb_u32_e64 v9, s[4:5], 0, v6, vcc +; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v4 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v0 -; GISEL-NEXT: v_trunc_f32_e32 v9, v4 -; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v9 -; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v0 -; GISEL-NEXT: v_sub_i32_e64 v11, s[4:5], 0, v3 -; GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v11, v10, 0 -; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9 +; GISEL-NEXT: v_trunc_f32_e32 v7, v4 +; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v7 +; GISEL-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GISEL-NEXT: v_sub_i32_e64 v10, s[4:5], 0, v3 +; GISEL-NEXT: v_subb_u32_e64 v11, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v0, 0 +; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v7 ; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], 0, v6 -; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v9, v[0:1] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v10, v[5:6] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v14, -1, v0, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v0, v9, v4 -; GISEL-NEXT: v_mul_lo_u32 v6, v10, v5 -; GISEL-NEXT: v_mul_hi_u32 v15, v10, v4 +; GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GISEL-NEXT: v_mul_hi_u32 v15, v0, v4 +; GISEL-NEXT: v_mov_b32_e32 v6, v5 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v12, v[6:7] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v0, v[5:6] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v14, -1, v7, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v6, v12, v4 +; GISEL-NEXT: v_mul_lo_u32 v7, v0, v5 ; GISEL-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v13, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 +; GISEL-NEXT: v_mul_hi_u32 v4, v12, v4 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v15, v9, v5 -; GISEL-NEXT: v_mul_hi_u32 v4, v9, v4 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0 -; GISEL-NEXT: v_mul_hi_u32 v6, v10, v5 +; GISEL-NEXT: v_mul_lo_u32 v15, v12, v5 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GISEL-NEXT: v_mul_hi_u32 v7, v0, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v15, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v15, v7 +; GISEL-NEXT: v_mul_hi_u32 v5, v12, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v15, v6 -; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v0 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v11, v10, 0 -; GISEL-NEXT: v_sub_i32_e32 v15, vcc, v7, v1 -; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v9, v[0:1] +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v12, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v0, 0 +; GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GISEL-NEXT: v_sub_i32_e32 v15, vcc, v8, v1 +; GISEL-NEXT: v_mov_b32_e32 v6, v5 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v12, v[6:7] ; GISEL-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v13, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v10, v[5:6] ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v15, v1 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v0, v[5:6] ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v13 -; GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v16, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v7, -1, v16, vcc ; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v15, v1 ; GISEL-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v13, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v9, v4 -; GISEL-NEXT: v_mul_lo_u32 v12, v10, v5 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v10, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v9, v4 -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v11, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v11, v9, v5 -; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v12, v0 -; GISEL-NEXT: v_mul_hi_u32 v12, v10, v5 -; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v11, v4 +; GISEL-NEXT: v_mul_lo_u32 v10, v12, v4 +; GISEL-NEXT: v_mul_lo_u32 v11, v0, v5 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; GISEL-NEXT: v_cndmask_b32_e32 v7, v15, v1, vcc +; GISEL-NEXT: v_mul_hi_u32 v1, v0, v4 +; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 -; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5 -; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v4, v0 +; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v10, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v10, v12, v5 +; GISEL-NEXT: v_mul_hi_u32 v4, v12, v4 +; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v11, v1 +; GISEL-NEXT: v_mul_hi_u32 v11, v0, v5 +; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v10, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v11 +; GISEL-NEXT: v_mul_hi_u32 v5, v12, v5 +; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v4, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v11, v4 +; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v10, v4 ; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v5, v4 -; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v10, v0 -; GISEL-NEXT: v_addc_u32_e64 v4, s[4:5], v9, v4, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v5, 0, v0 -; GISEL-NEXT: v_mul_lo_u32 v9, v2, v4 +; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v0, v1 +; GISEL-NEXT: v_addc_u32_e64 v1, s[4:5], v12, v4, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v4, 0, v0 +; GISEL-NEXT: v_mul_lo_u32 v5, v2, v1 ; GISEL-NEXT: v_cndmask_b32_e32 v10, v13, v6, vcc ; GISEL-NEXT: v_mul_hi_u32 v6, v2, v0 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v15, v1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9 -; GISEL-NEXT: v_mul_lo_u32 v9, 0, v4 ; GISEL-NEXT: v_mul_hi_u32 v0, 0, v0 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v2, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; GISEL-NEXT: v_mul_lo_u32 v5, 0, v1 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; GISEL-NEXT: v_mul_hi_u32 v6, v2, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v5 -; GISEL-NEXT: v_mul_hi_u32 v11, 0, v4 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v9, 0 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v4 +; GISEL-NEXT: v_mul_hi_u32 v1, 0, v1 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v11, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v11, v0 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v1, v0 +; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 ; GISEL-NEXT: v_mov_b32_e32 v0, v5 ; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v6, v[0:1] -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v7, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v9, v[5:6] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v10, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v0, v8, v7, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v11, v[5:6] ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 ; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 0, v5, vcc ; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v5 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/twoaddr-extract-dyn-v7f64.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/twoaddr-extract-dyn-v7f64.mir index 75148ecff5377..69504402892e7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/twoaddr-extract-dyn-v7f64.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/twoaddr-extract-dyn-v7f64.mir @@ -44,34 +44,35 @@ body: | ; CHECK-NEXT: undef [[COPY20:%[0-9]+]].sub0:vreg_64 = COPY [[COPY12]] ; CHECK-NEXT: [[COPY20:%[0-9]+]].sub1:vreg_64 = COPY [[COPY13]] ; CHECK-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY $vgpr14 - ; CHECK-NEXT: undef [[COPY22:%[0-9]+]].sub0_sub1:vreg_512 = COPY [[COPY14]] - ; CHECK-NEXT: [[COPY22:%[0-9]+]].sub2_sub3:vreg_512 = COPY [[COPY15]] - ; CHECK-NEXT: [[COPY22:%[0-9]+]].sub4_sub5:vreg_512 = COPY [[COPY16]] - ; CHECK-NEXT: [[COPY22:%[0-9]+]].sub6_sub7:vreg_512 = COPY [[COPY17]] - ; CHECK-NEXT: [[COPY22:%[0-9]+]].sub8_sub9:vreg_512 = COPY [[COPY18]] - ; CHECK-NEXT: [[COPY22:%[0-9]+]].sub10_sub11:vreg_512 = COPY [[COPY19]] - ; CHECK-NEXT: [[COPY22:%[0-9]+]].sub12_sub13:vreg_512 = COPY [[COPY20]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_512 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF:%[0-9]+]].sub0_sub1:vreg_512 = COPY [[COPY14]] + ; CHECK-NEXT: [[DEF:%[0-9]+]].sub2_sub3:vreg_512 = COPY [[COPY15]] + ; CHECK-NEXT: [[DEF:%[0-9]+]].sub4_sub5:vreg_512 = COPY [[COPY16]] + ; CHECK-NEXT: [[DEF:%[0-9]+]].sub6_sub7:vreg_512 = COPY [[COPY17]] + ; CHECK-NEXT: [[DEF:%[0-9]+]].sub8_sub9:vreg_512 = COPY [[COPY18]] + ; CHECK-NEXT: [[DEF:%[0-9]+]].sub10_sub11:vreg_512 = COPY [[COPY19]] + ; CHECK-NEXT: [[DEF:%[0-9]+]].sub12_sub13:vreg_512 = COPY [[COPY20]] ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 1, [[COPY21]], implicit $exec - ; CHECK-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[COPY22]].sub0, 0, [[COPY22]].sub2, [[V_CMP_EQ_U32_e64_]], implicit $exec - ; CHECK-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[COPY22]].sub1, 0, [[COPY22]].sub3, [[V_CMP_EQ_U32_e64_]], implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[DEF]].sub0, 0, [[DEF]].sub2, [[V_CMP_EQ_U32_e64_]], implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[DEF]].sub1, 0, [[DEF]].sub3, [[V_CMP_EQ_U32_e64_]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 2, [[COPY21]], implicit $exec - ; CHECK-NEXT: [[V_CNDMASK_B32_e64_2:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_CNDMASK_B32_e64_]], 0, [[COPY22]].sub4, [[V_CMP_EQ_U32_e64_1]], implicit $exec - ; CHECK-NEXT: [[V_CNDMASK_B32_e64_3:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_CNDMASK_B32_e64_1]], 0, [[COPY22]].sub5, [[V_CMP_EQ_U32_e64_1]], implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_e64_2:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_CNDMASK_B32_e64_]], 0, [[DEF]].sub4, [[V_CMP_EQ_U32_e64_1]], implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_e64_3:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_CNDMASK_B32_e64_1]], 0, [[DEF]].sub5, [[V_CMP_EQ_U32_e64_1]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_2:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 3, [[COPY21]], implicit $exec - ; CHECK-NEXT: [[V_CNDMASK_B32_e64_4:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_CNDMASK_B32_e64_2]], 0, [[COPY22]].sub6, [[V_CMP_EQ_U32_e64_2]], implicit $exec - ; CHECK-NEXT: [[V_CNDMASK_B32_e64_5:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_CNDMASK_B32_e64_3]], 0, [[COPY22]].sub7, [[V_CMP_EQ_U32_e64_2]], implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_e64_4:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_CNDMASK_B32_e64_2]], 0, [[DEF]].sub6, [[V_CMP_EQ_U32_e64_2]], implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_e64_5:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_CNDMASK_B32_e64_3]], 0, [[DEF]].sub7, [[V_CMP_EQ_U32_e64_2]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 4, [[COPY21]], implicit $exec - ; CHECK-NEXT: [[V_CNDMASK_B32_e64_6:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_CNDMASK_B32_e64_4]], 0, [[COPY22]].sub8, [[V_CMP_EQ_U32_e64_3]], implicit $exec - ; CHECK-NEXT: [[V_CNDMASK_B32_e64_7:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_CNDMASK_B32_e64_5]], 0, [[COPY22]].sub9, [[V_CMP_EQ_U32_e64_3]], implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_e64_6:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_CNDMASK_B32_e64_4]], 0, [[DEF]].sub8, [[V_CMP_EQ_U32_e64_3]], implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_e64_7:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_CNDMASK_B32_e64_5]], 0, [[DEF]].sub9, [[V_CMP_EQ_U32_e64_3]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_4:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 5, [[COPY21]], implicit $exec - ; CHECK-NEXT: [[V_CNDMASK_B32_e64_8:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_CNDMASK_B32_e64_6]], 0, [[COPY22]].sub10, [[V_CMP_EQ_U32_e64_4]], implicit $exec - ; CHECK-NEXT: [[V_CNDMASK_B32_e64_9:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_CNDMASK_B32_e64_7]], 0, [[COPY22]].sub11, [[V_CMP_EQ_U32_e64_4]], implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_e64_8:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_CNDMASK_B32_e64_6]], 0, [[DEF]].sub10, [[V_CMP_EQ_U32_e64_4]], implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_e64_9:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_CNDMASK_B32_e64_7]], 0, [[DEF]].sub11, [[V_CMP_EQ_U32_e64_4]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_5:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 6, [[COPY21]], implicit $exec - ; CHECK-NEXT: [[V_CNDMASK_B32_e64_10:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_CNDMASK_B32_e64_8]], 0, [[COPY22]].sub12, [[V_CMP_EQ_U32_e64_5]], implicit $exec - ; CHECK-NEXT: [[V_CNDMASK_B32_e64_11:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_CNDMASK_B32_e64_9]], 0, [[COPY22]].sub13, [[V_CMP_EQ_U32_e64_5]], implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_e64_10:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_CNDMASK_B32_e64_8]], 0, [[DEF]].sub12, [[V_CMP_EQ_U32_e64_5]], implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_e64_11:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_CNDMASK_B32_e64_9]], 0, [[DEF]].sub13, [[V_CMP_EQ_U32_e64_5]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_6:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 7, [[COPY21]], implicit $exec - ; CHECK-NEXT: [[V_CNDMASK_B32_e64_12:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_CNDMASK_B32_e64_10]], 0, undef [[COPY22]].sub14, [[V_CMP_EQ_U32_e64_6]], implicit $exec - ; CHECK-NEXT: [[V_CNDMASK_B32_e64_13:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_CNDMASK_B32_e64_11]], 0, undef [[COPY22]].sub15, [[V_CMP_EQ_U32_e64_6]], implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_e64_12:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_CNDMASK_B32_e64_10]], 0, undef [[DEF]].sub14, [[V_CMP_EQ_U32_e64_6]], implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_e64_13:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_CNDMASK_B32_e64_11]], 0, undef [[DEF]].sub15, [[V_CMP_EQ_U32_e64_6]], implicit $exec ; CHECK-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_e64_12]] ; CHECK-NEXT: $vgpr1 = COPY [[V_CNDMASK_B32_e64_13]] ; CHECK-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll index d9158e3558395..59f6df4b52162 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll @@ -645,10 +645,12 @@ define i32 @v_uaddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { ; GFX11-TRUE16-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_lshlrev_b16 v2, 8, v3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3 ; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, v1 clamp ; GFX11-TRUE16-NEXT: v_pk_lshlrev_b16 v1, 8, v6 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v2, v1 clamp +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h ; GFX11-TRUE16-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 @@ -894,6 +896,8 @@ define amdgpu_ps i32 @s_uaddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s2 ; GFX11-TRUE16-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s0, s1 clamp +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h ; GFX11-TRUE16-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll index ba5a8e9c68a1f..d4547b689b67b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll @@ -132,60 +132,64 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_trunc_f32_e32 v2, v1 ; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 ; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] -; GFX8-NEXT: v_mul_hi_u32 v5, v3, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2] -; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX8-NEXT: v_mul_lo_u32 v6, v3, v1 -; GFX8-NEXT: v_mul_lo_u32 v7, v4, v1 -; GFX8-NEXT: v_mul_hi_u32 v8, v3, v1 -; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1 +; GFX8-NEXT: v_cvt_u32_f32_e32 v4, v0 +; GFX8-NEXT: v_cvt_u32_f32_e32 v5, v2 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v4, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v5, v[2:3] +; GFX8-NEXT: v_mul_hi_u32 v3, v4, v0 +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v4, v[1:2] +; GFX8-NEXT: v_mul_lo_u32 v2, v5, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, v5, v0 +; GFX8-NEXT: v_mul_lo_u32 v6, v4, v1 +; GFX8-NEXT: v_mul_lo_u32 v7, v5, v1 +; GFX8-NEXT: v_mul_hi_u32 v8, v4, v1 +; GFX8-NEXT: v_mul_hi_u32 v1, v5, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 ; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v7, v5 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v7, v3 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] -; GFX8-NEXT: v_mul_hi_u32 v6, v3, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2] -; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX8-NEXT: v_mul_lo_u32 v5, v3, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v5, v1, vcc +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v4, 0 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v5, v[2:3] +; GFX8-NEXT: v_mul_hi_u32 v6, v4, v0 +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v4, v[1:2] +; GFX8-NEXT: v_mul_lo_u32 v2, v5, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, v5, v0 +; GFX8-NEXT: v_mul_lo_u32 v3, v4, v1 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v6, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_mul_hi_u32 v5, v3, v1 +; GFX8-NEXT: v_mul_lo_u32 v6, v5, v1 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 +; GFX8-NEXT: v_mul_hi_u32 v3, v4, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v6, v5 -; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v6, v3 +; GFX8-NEXT: v_mul_hi_u32 v1, v5, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v4, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc ; GFX8-NEXT: v_mul_lo_u32 v2, s9, v0 ; GFX8-NEXT: v_mul_lo_u32 v3, s8, v1 ; GFX8-NEXT: v_mul_hi_u32 v4, s8, v0 @@ -207,10 +211,12 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v4, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v2 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s10, v3, v[1:2] +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v2 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX8-NEXT: v_mov_b32_e32 v6, s9 -; GFX8-NEXT: v_mov_b32_e32 v5, s11 +; GFX8-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s10, v5, v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, s11 ; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s11, v4, v[1:2] ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s8, v0 ; GFX8-NEXT: v_subb_u32_e64 v6, s[0:1], v6, v1, vcc @@ -220,31 +226,31 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v6 -; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc +; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v3, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[0:1] ; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, s10, v2 ; GFX8-NEXT: v_subbrev_u32_e64 v8, s[0:1], 0, v0, vcc ; GFX8-NEXT: v_add_u32_e64 v9, s[0:1], 1, v4 -; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1] +; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v5, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v7 -; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc +; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v3, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v8 -; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s10, v7 +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s10, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1] ; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v9 ; GFX8-NEXT: v_subbrev_u32_e32 v14, vcc, 0, v0, vcc ; GFX8-NEXT: v_addc_u32_e64 v13, s[0:1], 0, v10, s[0:1] ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v9, v12, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v9, v10, v13, vcc ; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v9, v10, v13, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, v0, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, v9, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v14, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v1, v5, v9, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v4, s[0:1] ; GFX8-NEXT: v_mov_b32_e32 v4, s4 @@ -271,59 +277,63 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_trunc_f32_e32 v2, v1 ; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 ; GFX9-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] -; GFX9-NEXT: v_mul_hi_u32 v5, v3, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX9-NEXT: v_mul_lo_u32 v6, v3, v1 -; GFX9-NEXT: v_mul_lo_u32 v7, v4, v1 -; GFX9-NEXT: v_mul_hi_u32 v8, v3, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1 +; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v2 +; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v5, v[2:3] +; GFX9-NEXT: v_mul_hi_u32 v3, v4, v0 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v4, v[1:2] +; GFX9-NEXT: v_mul_lo_u32 v2, v5, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, v5, v0 +; GFX9-NEXT: v_mul_lo_u32 v6, v4, v1 +; GFX9-NEXT: v_mul_lo_u32 v7, v5, v1 +; GFX9-NEXT: v_mul_hi_u32 v8, v4, v1 +; GFX9-NEXT: v_mul_hi_u32 v1, v5, v1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v7, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8 ; GFX9-NEXT: v_add_u32_e32 v2, v6, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 +; GFX9-NEXT: v_add_u32_e32 v3, v7, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 +; GFX9-NEXT: v_add3_u32 v1, v3, v2, v1 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v1, vcc +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v4, 0 +; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX9-NEXT: v_mov_b32_e32 v7, s19 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] -; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX9-NEXT: v_mul_lo_u32 v5, v3, v1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX9-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v5, v[2:3] +; GFX9-NEXT: v_mul_hi_u32 v6, v4, v0 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v4, v[1:2] +; GFX9-NEXT: v_mul_lo_u32 v2, v5, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, v5, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, v4, v1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v6, v4, v1 -; GFX9-NEXT: v_add_u32_e32 v2, v5, v2 -; GFX9-NEXT: v_mul_hi_u32 v5, v3, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1 +; GFX9-NEXT: v_mul_lo_u32 v6, v5, v1 +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_mul_hi_u32 v3, v4, v1 +; GFX9-NEXT: v_mul_hi_u32 v1, v5, v1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 +; GFX9-NEXT: v_add_u32_e32 v3, v6, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc +; GFX9-NEXT: v_add3_u32 v1, v3, v2, v1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v1, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, s17, v0 ; GFX9-NEXT: v_mul_lo_u32 v3, s16, v1 ; GFX9-NEXT: v_mul_hi_u32 v4, s16, v0 @@ -344,25 +354,27 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s18, v5, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_add3_u32 v3, v3, v2, v6 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s18, v3, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v6, s17 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_add3_u32 v4, v3, v2, v6 +; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s18, v4, v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, s17 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s19, v5, v[1:2] ; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s16, v0 -; GFX9-NEXT: v_subb_co_u32_e64 v6, s[0:1], v6, v1, vcc -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s19, v6 +; GFX9-NEXT: v_subb_co_u32_e64 v3, s[0:1], v3, v1, vcc +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s19, v3 ; GFX9-NEXT: v_sub_u32_e32 v0, s17, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s18, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s19, v6 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s19, v3 ; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v7, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[0:1] ; GFX9-NEXT: v_subrev_co_u32_e32 v8, vcc, s18, v2 ; GFX9-NEXT: v_subbrev_co_u32_e64 v9, s[0:1], 0, v0, vcc ; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], 1, v5 -; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v3, s[0:1] +; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v4, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s19, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s18, v8 @@ -379,18 +391,19 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_cndmask_b32_e32 v10, v11, v14, vcc ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v5, v0, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, v10, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v1, v4, v10, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v4, v8, v7, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v15, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v5, s[0:1] -; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[12:13] -; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[14:15] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] +; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[12:13] +; GFX9-NEXT: global_store_dwordx2 v6, v[2:3], s[14:15] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: udivrem_i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x0 +; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s19 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s18 @@ -400,61 +413,64 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GFX10-NEXT: v_trunc_f32_e32 v2, v1 -; GFX10-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 -; GFX10-NEXT: v_cvt_u32_f32_e32 v4, v2 +; GFX10-NEXT: v_trunc_f32_e32 v4, v1 +; GFX10-NEXT: v_mul_f32_e32 v1, 0xcf800000, v4 +; GFX10-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GFX10-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v0 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s1, s0, v3, 0 -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s1, s0, v4, v[1:2] -; GFX10-NEXT: s_subb_u32 s1, 0, s19 +; GFX10-NEXT: v_cvt_u32_f32_e32 v5, v0 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s1, s0, v5, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, v1 ; GFX10-NEXT: v_mul_hi_u32 v6, v4, v0 -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s2, s1, v3, v[1:2] +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s1, s0, v4, v[2:3] +; GFX10-NEXT: s_subb_u32 s1, 0, s19 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s2, s1, v5, v[1:2] ; GFX10-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX10-NEXT: v_mul_hi_u32 v0, v3, v0 -; GFX10-NEXT: v_mul_lo_u32 v5, v3, v1 +; GFX10-NEXT: v_mul_hi_u32 v0, v5, v0 +; GFX10-NEXT: v_mul_lo_u32 v3, v5, v1 ; GFX10-NEXT: v_mul_lo_u32 v7, v4, v1 -; GFX10-NEXT: v_mul_hi_u32 v8, v3, v1 +; GFX10-NEXT: v_mul_hi_u32 v8, v5, v1 ; GFX10-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX10-NEXT: v_add_co_u32 v2, s2, v2, v5 -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s2 +; GFX10-NEXT: v_add_co_u32 v2, s2, v2, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 ; GFX10-NEXT: v_add_co_u32 v6, s2, v7, v6 ; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s2 ; GFX10-NEXT: v_add_co_u32 v0, s2, v2, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 ; GFX10-NEXT: v_add_co_u32 v2, s2, v6, v8 ; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s2 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v5, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v5, v7, v6 +; GFX10-NEXT: v_add_nc_u32_e32 v0, v3, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v3, v7, v6 ; GFX10-NEXT: v_add_co_u32 v0, s2, v2, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2 -; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, v3, v0 -; GFX10-NEXT: v_add3_u32 v1, v5, v2, v1 +; GFX10-NEXT: v_add_co_u32 v5, vcc_lo, v5, v0 +; GFX10-NEXT: v_add3_u32 v1, v3, v2, v1 +; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v1, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, s0, v3, 0 -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, s0, v4, v[1:2] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, s0, v5, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, v1 ; GFX10-NEXT: v_mul_hi_u32 v6, v4, v0 -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, s1, v3, v[1:2] +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, s0, v4, v[2:3] +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, s1, v5, v[1:2] ; GFX10-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX10-NEXT: v_mul_hi_u32 v0, v3, v0 -; GFX10-NEXT: v_mul_lo_u32 v5, v3, v1 +; GFX10-NEXT: v_mul_hi_u32 v0, v5, v0 +; GFX10-NEXT: v_mul_lo_u32 v3, v5, v1 ; GFX10-NEXT: v_mul_lo_u32 v7, v4, v1 -; GFX10-NEXT: v_mul_hi_u32 v8, v3, v1 +; GFX10-NEXT: v_mul_hi_u32 v8, v5, v1 ; GFX10-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX10-NEXT: v_add_co_u32 v2, s0, v2, v5 -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0 +; GFX10-NEXT: v_add_co_u32 v2, s0, v2, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v6, s0, v7, v6 ; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v0, s0, v2, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v2, s0, v6, v8 ; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v5, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v5, v7, v6 +; GFX10-NEXT: v_add_nc_u32_e32 v0, v3, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v3, v7, v6 ; GFX10-NEXT: v_add_co_u32 v0, s0, v2, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v3, v0 -; GFX10-NEXT: v_add3_u32 v1, v5, v2, v1 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v5, v0 +; GFX10-NEXT: v_add3_u32 v1, v3, v2, v1 ; GFX10-NEXT: v_mul_lo_u32 v2, s17, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v4, v1, vcc_lo ; GFX10-NEXT: v_mul_hi_u32 v4, s16, v0 @@ -471,16 +487,18 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_add_nc_u32_e32 v2, v6, v2 ; GFX10-NEXT: v_add_co_u32 v0, s0, v0, v3 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 +; GFX10-NEXT: v_mul_hi_u32 v6, s17, v1 ; GFX10-NEXT: v_add_co_u32 v5, s0, v0, v2 -; GFX10-NEXT: v_mul_hi_u32 v2, s17, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 -; GFX10-NEXT: v_add_nc_u32_e32 v3, v4, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s0 +; GFX10-NEXT: v_add_nc_u32_e32 v4, v4, v3 +; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s18, v5, 0 -; GFX10-NEXT: v_add3_u32 v3, v3, v6, v2 -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, s18, v3, v[1:2] +; GFX10-NEXT: v_add3_u32 v4, v4, v7, v6 +; GFX10-NEXT: v_mov_b32_e32 v2, v1 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, s18, v4, v[2:3] ; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, s19, v5, v[1:2] ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v5, 1 -; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v4, vcc_lo ; GFX10-NEXT: v_sub_co_u32 v7, vcc_lo, s16, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v6, s17, v1 ; GFX10-NEXT: v_sub_co_ci_u32_e64 v8, s0, s17, v1, vcc_lo @@ -497,7 +515,7 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s19, v9 ; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, -1, s0 ; GFX10-NEXT: v_add_co_u32 v13, s0, v2, 1 -; GFX10-NEXT: v_add_co_ci_u32_e64 v14, s0, 0, v4, s0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v14, s0, 0, v3, s0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s19, v9 ; GFX10-NEXT: v_cndmask_b32_e64 v11, v12, v11, s0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s19, v8 @@ -507,13 +525,13 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v0, s0, 0, v0, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v13, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v14, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v14, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v10, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v6, v9, v0, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v9, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v2, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v3, v1, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v4, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v4, v1, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v3, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v8, v6, s0 ; GFX10-NEXT: global_store_dwordx2 v9, v[0:1], s[12:13] ; GFX10-NEXT: global_store_dwordx2 v9, v[2:3], s[14:15] @@ -1010,66 +1028,71 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_trunc_f32_e32 v2, v1 ; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 ; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] -; GFX8-NEXT: v_mul_hi_u32 v5, v3, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2] -; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX8-NEXT: v_mul_lo_u32 v6, v3, v1 -; GFX8-NEXT: v_mul_lo_u32 v7, v4, v1 -; GFX8-NEXT: v_mul_hi_u32 v8, v3, v1 -; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1 +; GFX8-NEXT: v_cvt_u32_f32_e32 v4, v0 +; GFX8-NEXT: v_cvt_u32_f32_e32 v5, v2 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v4, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v5, v[2:3] +; GFX8-NEXT: v_mul_hi_u32 v3, v4, v0 +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v4, v[1:2] +; GFX8-NEXT: v_mul_lo_u32 v2, v5, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, v5, v0 +; GFX8-NEXT: v_mul_lo_u32 v6, v4, v1 +; GFX8-NEXT: v_mul_lo_u32 v7, v5, v1 +; GFX8-NEXT: v_mul_hi_u32 v8, v4, v1 +; GFX8-NEXT: v_mul_hi_u32 v1, v5, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 ; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v7, v5 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v7, v3 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] -; GFX8-NEXT: v_mul_hi_u32 v6, v3, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v5, v1, vcc +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v4, 0 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v5, v[2:3] +; GFX8-NEXT: v_mul_hi_u32 v6, v4, v0 ; GFX8-NEXT: s_sub_u32 s2, 0, s14 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2] -; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX8-NEXT: v_mul_lo_u32 v5, v3, v1 +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v4, v[1:2] +; GFX8-NEXT: v_mul_lo_u32 v2, v5, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, v5, v0 +; GFX8-NEXT: v_mul_lo_u32 v3, v4, v1 ; GFX8-NEXT: s_subb_u32 s3, 0, s15 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v6, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_mul_hi_u32 v5, v3, v1 +; GFX8-NEXT: v_mul_lo_u32 v6, v5, v1 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 +; GFX8-NEXT: v_mul_hi_u32 v3, v4, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v6, v5 -; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v6, v3 +; GFX8-NEXT: v_mul_hi_u32 v1, v5, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v4, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc ; GFX8-NEXT: v_mul_lo_u32 v2, s9, v0 ; GFX8-NEXT: v_mul_lo_u32 v3, s8, v1 ; GFX8-NEXT: v_mul_hi_u32 v4, s8, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, s9, v0 +; GFX8-NEXT: v_mov_b32_e32 v5, s13 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 @@ -1088,11 +1111,12 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v4, v2 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s12, v7, v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v3, s9 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX8-NEXT: v_sub_u32_e32 v8, vcc, s8, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s12, v7, v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, s9 ; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s13, v6, v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v4, s13 ; GFX8-NEXT: v_subb_u32_e64 v0, s[0:1], v3, v1, vcc ; GFX8-NEXT: v_sub_u32_e64 v1, s[0:1], s9, v1 ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v0 @@ -1103,94 +1127,97 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_cndmask_b32_e64 v9, v2, v3, s[0:1] ; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s15 ; GFX8-NEXT: v_cvt_f32_u32_e32 v3, s14 -; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v1, v4, vcc +; GFX8-NEXT: v_subb_u32_e32 v10, vcc, v1, v5, vcc ; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f800000, v2 ; GFX8-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX8-NEXT: v_subrev_u32_e32 v10, vcc, s12, v8 -; GFX8-NEXT: v_subbrev_u32_e64 v11, s[0:1], 0, v5, vcc +; GFX8-NEXT: v_subrev_u32_e32 v11, vcc, s12, v8 +; GFX8-NEXT: v_subbrev_u32_e64 v12, s[0:1], 0, v10, vcc ; GFX8-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 ; GFX8-NEXT: v_mul_f32_e32 v2, 0x2f800000, v1 ; GFX8-NEXT: v_trunc_f32_e32 v3, v2 ; GFX8-NEXT: v_mul_f32_e32 v2, 0xcf800000, v3 ; GFX8-NEXT: v_add_f32_e32 v1, v2, v1 -; GFX8-NEXT: v_cvt_u32_f32_e32 v12, v1 -; GFX8-NEXT: v_add_u32_e64 v13, s[0:1], 1, v6 -; GFX8-NEXT: v_addc_u32_e64 v14, s[0:1], 0, v7, s[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v12, 0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v15, v3 -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v11 -; GFX8-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1] -; GFX8-NEXT: v_subb_u32_e32 v4, vcc, v5, v4, vcc -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v15, v[2:3] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v10 +; GFX8-NEXT: v_cvt_u32_f32_e32 v13, v1 +; GFX8-NEXT: v_add_u32_e64 v14, s[0:1], 1, v6 +; GFX8-NEXT: v_addc_u32_e64 v15, s[0:1], 0, v7, s[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v13, 0 +; GFX8-NEXT: v_cvt_u32_f32_e32 v16, v3 +; GFX8-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v12 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v12, v[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v11 -; GFX8-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[0:1] -; GFX8-NEXT: v_mul_lo_u32 v3, v15, v1 -; GFX8-NEXT: v_mul_lo_u32 v17, v12, v2 -; GFX8-NEXT: v_mul_hi_u32 v5, v12, v1 -; GFX8-NEXT: v_mul_hi_u32 v1, v15, v1 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v17 -; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v16, v[3:4] +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v11 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v13, v[2:3] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v12 +; GFX8-NEXT: v_cndmask_b32_e64 v17, v17, v4, s[0:1] +; GFX8-NEXT: v_mul_lo_u32 v3, v16, v1 +; GFX8-NEXT: v_mul_lo_u32 v4, v13, v2 +; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v10, v5, vcc +; GFX8-NEXT: v_mul_hi_u32 v10, v13, v1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v10 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v5, v15, v2 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v17, v3 -; GFX8-NEXT: v_mul_hi_u32 v17, v12, v2 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v5, v1 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v17 -; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v17 -; GFX8-NEXT: v_add_u32_e32 v17, vcc, 1, v13 -; GFX8-NEXT: v_addc_u32_e32 v18, vcc, 0, v14, vcc -; GFX8-NEXT: v_subrev_u32_e32 v19, vcc, s12, v10 -; GFX8-NEXT: v_mul_hi_u32 v2, v15, v2 -; GFX8-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v4, vcc +; GFX8-NEXT: v_mul_lo_u32 v10, v16, v2 +; GFX8-NEXT: v_mul_hi_u32 v1, v16, v1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 +; GFX8-NEXT: v_mul_hi_u32 v4, v13, v2 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v10, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v10, v4 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 1, v14 +; GFX8-NEXT: v_addc_u32_e32 v18, vcc, 0, v15, vcc +; GFX8-NEXT: v_subrev_u32_e32 v19, vcc, s12, v11 +; GFX8-NEXT: v_mul_hi_u32 v2, v16, v2 +; GFX8-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v5, vcc ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v3 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 -; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v1 -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v12, 0 -; GFX8-NEXT: v_addc_u32_e32 v15, vcc, v15, v2, vcc -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v13, v17, vcc +; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, v1 +; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v13, 0 +; GFX8-NEXT: v_addc_u32_e32 v16, vcc, v16, v2, vcc +; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 ; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v15, v[1:2] -; GFX8-NEXT: v_cndmask_b32_e32 v13, v14, v18, vcc +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v16, v[1:2] +; GFX8-NEXT: v_cndmask_b32_e32 v10, v14, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v14, v15, v18, vcc +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s3, v13, v[4:5] ; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9 -; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s3, v12, v[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v1, v6, v2, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v13, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v5, v10, v19, vcc -; GFX8-NEXT: v_mul_lo_u32 v7, v15, v3 -; GFX8-NEXT: v_mul_lo_u32 v9, v12, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v14, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v5, v11, v19, vcc +; GFX8-NEXT: v_mul_lo_u32 v7, v16, v3 +; GFX8-NEXT: v_mul_lo_u32 v9, v13, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v5, v8, v5, s[0:1] -; GFX8-NEXT: v_mul_hi_u32 v8, v12, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v11, v20, vcc +; GFX8-NEXT: v_mul_hi_u32 v8, v13, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v1, v6, v10, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v6, v12, v20, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v8, v15, v4 -; GFX8-NEXT: v_mul_hi_u32 v3, v15, v3 +; GFX8-NEXT: v_mul_lo_u32 v8, v16, v4 +; GFX8-NEXT: v_mul_hi_u32 v3, v16, v3 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v9, v7 -; GFX8-NEXT: v_mul_hi_u32 v9, v12, v4 +; GFX8-NEXT: v_mul_hi_u32 v9, v13, v4 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v8, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v9 -; GFX8-NEXT: v_mul_hi_u32 v4, v15, v4 +; GFX8-NEXT: v_mul_hi_u32 v4, v16, v4 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v8, v7 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v7 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v12, v3 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v15, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v13, v3 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v16, v4, vcc ; GFX8-NEXT: v_mul_lo_u32 v7, s11, v3 ; GFX8-NEXT: v_mul_lo_u32 v8, s10, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v6, v0, v6, s[0:1] @@ -1208,54 +1235,55 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v3, v0 -; GFX8-NEXT: v_mul_hi_u32 v8, s11, v4 -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s14, v9, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0 -; GFX8-NEXT: v_add_u32_e32 v10, vcc, v8, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s14, v10, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v4, s11 -; GFX8-NEXT: v_mov_b32_e32 v0, s15 -; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s15, v9, v[7:8] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 +; GFX8-NEXT: v_mul_hi_u32 v9, s11, v4 +; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s14, v0, 0 +; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; GFX8-NEXT: ; implicit-def: $vgpr7_vgpr8 +; GFX8-NEXT: v_mov_b32_e32 v10, s11 +; GFX8-NEXT: v_mov_b32_e32 v7, v4 +; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s14, v9, v[7:8] +; GFX8-NEXT: v_mov_b32_e32 v4, s15 +; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s15, v0, v[7:8] ; GFX8-NEXT: v_sub_u32_e32 v8, vcc, s10, v3 -; GFX8-NEXT: v_subb_u32_e64 v11, s[0:1], v4, v7, vcc +; GFX8-NEXT: v_subb_u32_e64 v10, s[0:1], v10, v7, vcc ; GFX8-NEXT: v_sub_u32_e64 v3, s[0:1], s11, v7 -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v11 -; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v8 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v10 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v11 -; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[0:1] -; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, s14, v8 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v8 +; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v10 +; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v11, s[0:1] +; GFX8-NEXT: v_subrev_u32_e32 v11, vcc, s14, v8 ; GFX8-NEXT: v_subbrev_u32_e64 v12, s[0:1], 0, v3, vcc ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v12 ; GFX8-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v7 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v11 ; GFX8-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v12 ; GFX8-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[0:1] -; GFX8-NEXT: v_add_u32_e64 v14, s[0:1], 1, v9 -; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v3, v0, vcc -; GFX8-NEXT: v_addc_u32_e64 v15, s[0:1], 0, v10, s[0:1] -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v14 +; GFX8-NEXT: v_add_u32_e64 v14, s[0:1], 1, v0 +; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc +; GFX8-NEXT: v_addc_u32_e64 v15, s[0:1], 0, v9, s[0:1] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v14 ; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v15, vcc ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; GFX8-NEXT: v_subrev_u32_e64 v13, s[0:1], s14, v7 -; GFX8-NEXT: v_subbrev_u32_e64 v0, s[0:1], 0, v0, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v3, v14, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc -; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 -; GFX8-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v4, v10, v14, s[0:1] +; GFX8-NEXT: v_subrev_u32_e64 v13, s[0:1], s14, v11 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v14, v4, vcc +; GFX8-NEXT: v_subbrev_u32_e64 v14, s[0:1], 0, v3, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v15, v15, v16, vcc +; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v7 +; GFX8-NEXT: v_cndmask_b32_e64 v3, v0, v4, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v4, v9, v15, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v0, v11, v13, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v9, v12, v14, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v7, v8, v0, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v8, v10, v9, s[0:1] ; GFX8-NEXT: v_mov_b32_e32 v10, s5 -; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v13, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v12, v0, vcc ; GFX8-NEXT: v_mov_b32_e32 v9, s4 -; GFX8-NEXT: v_cndmask_b32_e64 v7, v8, v7, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v8, v11, v0, s[0:1] ; GFX8-NEXT: flat_store_dwordx4 v[9:10], v[1:4] ; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 @@ -1279,60 +1307,64 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_trunc_f32_e32 v2, v1 ; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 ; GFX9-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] -; GFX9-NEXT: v_mul_hi_u32 v5, v3, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX9-NEXT: v_mul_lo_u32 v6, v3, v1 -; GFX9-NEXT: v_mul_lo_u32 v7, v4, v1 -; GFX9-NEXT: v_mul_hi_u32 v8, v3, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1 +; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v2 +; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v5, v[2:3] +; GFX9-NEXT: v_mul_hi_u32 v3, v4, v0 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v4, v[1:2] +; GFX9-NEXT: v_mul_lo_u32 v2, v5, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, v5, v0 +; GFX9-NEXT: v_mul_lo_u32 v6, v4, v1 +; GFX9-NEXT: v_mul_lo_u32 v7, v5, v1 +; GFX9-NEXT: v_mul_hi_u32 v8, v4, v1 +; GFX9-NEXT: v_mul_hi_u32 v1, v5, v1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v7, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8 ; GFX9-NEXT: v_add_u32_e32 v2, v6, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 +; GFX9-NEXT: v_add_u32_e32 v3, v7, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] -; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0 +; GFX9-NEXT: v_add3_u32 v1, v3, v2, v1 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v1, vcc +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v4, 0 +; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX9-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v5, v[2:3] +; GFX9-NEXT: v_mul_hi_u32 v6, v4, v0 ; GFX9-NEXT: s_sub_u32 s2, 0, s6 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX9-NEXT: v_mul_lo_u32 v5, v3, v1 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v4, v[1:2] +; GFX9-NEXT: v_mul_lo_u32 v2, v5, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, v5, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, v4, v1 ; GFX9-NEXT: s_subb_u32 s3, 0, s7 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v6, v4, v1 -; GFX9-NEXT: v_add_u32_e32 v2, v5, v2 -; GFX9-NEXT: v_mul_hi_u32 v5, v3, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1 +; GFX9-NEXT: v_mul_lo_u32 v6, v5, v1 +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_mul_hi_u32 v3, v4, v1 +; GFX9-NEXT: v_mul_hi_u32 v1, v5, v1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 +; GFX9-NEXT: v_add_u32_e32 v3, v6, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc +; GFX9-NEXT: v_add3_u32 v1, v3, v2, v1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v1, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, s17, v0 ; GFX9-NEXT: v_mul_lo_u32 v3, s16, v1 ; GFX9-NEXT: v_mul_hi_u32 v4, s16, v0 @@ -1345,6 +1377,7 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_mul_lo_u32 v4, s17, v1 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-NEXT: v_mul_hi_u32 v3, s16, v1 +; GFX9-NEXT: v_mov_b32_e32 v6, s5 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 @@ -1354,9 +1387,10 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 ; GFX9-NEXT: v_add3_u32 v8, v3, v2, v5 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s4, v8, v[1:2] +; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX9-NEXT: v_mov_b32_e32 v4, s17 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s4, v8, v[2:3] ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v7, v[1:2] ; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s16, v0 ; GFX9-NEXT: v_subb_co_u32_e64 v0, s[0:1], v4, v2, vcc @@ -1369,81 +1403,84 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s7 ; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s6 ; GFX9-NEXT: v_sub_u32_e32 v2, s17, v2 -; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v2, v5, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v10, vcc, v2, v6, vcc ; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f800000, v3 ; GFX9-NEXT: v_add_f32_e32 v2, v2, v4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX9-NEXT: v_subrev_co_u32_e32 v10, vcc, s4, v1 -; GFX9-NEXT: v_subbrev_co_u32_e64 v11, s[0:1], 0, v6, vcc +; GFX9-NEXT: v_subrev_co_u32_e32 v11, vcc, s4, v1 +; GFX9-NEXT: v_subbrev_co_u32_e64 v12, s[0:1], 0, v10, vcc ; GFX9-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; GFX9-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; GFX9-NEXT: v_trunc_f32_e32 v4, v3 ; GFX9-NEXT: v_mul_f32_e32 v3, 0xcf800000, v4 ; GFX9-NEXT: v_add_f32_e32 v2, v3, v2 -; GFX9-NEXT: v_cvt_u32_f32_e32 v12, v2 -; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v7 -; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v8, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v12, 0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v15, v4 -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v11 -; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1] -; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v6, v5, vcc -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v15, v[3:4] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v10 +; GFX9-NEXT: v_cvt_u32_f32_e32 v13, v2 +; GFX9-NEXT: v_add_co_u32_e64 v14, s[0:1], 1, v7 +; GFX9-NEXT: v_addc_co_u32_e64 v15, s[0:1], 0, v8, s[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v13, 0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v16, v4 +; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v12 +; GFX9-NEXT: v_mov_b32_e32 v4, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s3, v12, v[3:4] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, v11 -; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[0:1] -; GFX9-NEXT: v_mul_lo_u32 v4, v15, v2 -; GFX9-NEXT: v_mul_lo_u32 v17, v12, v3 -; GFX9-NEXT: v_mul_hi_u32 v6, v12, v2 -; GFX9-NEXT: v_mul_hi_u32 v2, v15, v2 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v17 -; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v16, v[4:5] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v11 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s3, v13, v[3:4] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, v12 +; GFX9-NEXT: v_cndmask_b32_e64 v17, v17, v5, s[0:1] +; GFX9-NEXT: v_mul_lo_u32 v4, v16, v2 +; GFX9-NEXT: v_mul_lo_u32 v5, v13, v3 +; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v10, v6, vcc +; GFX9-NEXT: v_mul_hi_u32 v10, v13, v2 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v10 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v6, v15, v3 -; GFX9-NEXT: v_add_u32_e32 v4, v17, v4 -; GFX9-NEXT: v_mul_hi_u32 v17, v12, v3 -; GFX9-NEXT: v_mul_hi_u32 v3, v15, v3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v6, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v17 -; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GFX9-NEXT: v_add_u32_e32 v6, v6, v17 -; GFX9-NEXT: v_add_co_u32_e32 v17, vcc, 1, v13 -; GFX9-NEXT: v_addc_co_u32_e32 v18, vcc, 0, v14, vcc -; GFX9-NEXT: v_subrev_co_u32_e32 v19, vcc, s4, v10 -; GFX9-NEXT: v_subbrev_co_u32_e32 v20, vcc, 0, v5, vcc +; GFX9-NEXT: v_mul_lo_u32 v10, v16, v3 +; GFX9-NEXT: v_mul_hi_u32 v2, v16, v2 +; GFX9-NEXT: v_add_u32_e32 v4, v5, v4 +; GFX9-NEXT: v_mul_hi_u32 v5, v13, v3 +; GFX9-NEXT: v_mul_hi_u32 v3, v16, v3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v10, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX9-NEXT: v_add_u32_e32 v5, v10, v5 +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 1, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v18, vcc, 0, v15, vcc +; GFX9-NEXT: v_subrev_co_u32_e32 v19, vcc, s4, v11 +; GFX9-NEXT: v_subbrev_co_u32_e32 v20, vcc, 0, v6, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v12, v2 -; GFX9-NEXT: v_add3_u32 v3, v6, v4, v3 -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v12, 0 -; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, v15, v3, vcc -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v13, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v13, vcc, v13, v2 +; GFX9-NEXT: v_add3_u32 v3, v5, v4, v3 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v13, 0 +; GFX9-NEXT: v_addc_co_u32_e32 v16, vcc, v16, v3, vcc +; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 ; GFX9-NEXT: v_mov_b32_e32 v2, v5 -; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s2, v15, v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v13, v14, v18, vcc +; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s2, v16, v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v10, v14, v10, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v14, v15, v18, vcc +; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[2:3], s3, v13, v[5:6] ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9 -; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[2:3], s3, v12, v[5:6] -; GFX9-NEXT: v_cndmask_b32_e64 v2, v7, v3, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v8, v13, s[0:1] -; GFX9-NEXT: v_mul_lo_u32 v7, v15, v4 -; GFX9-NEXT: v_mul_lo_u32 v8, v12, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v10, v19, vcc -; GFX9-NEXT: v_mul_hi_u32 v10, v12, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v11, v20, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v7, v10, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v8, v14, s[0:1] +; GFX9-NEXT: v_mul_lo_u32 v7, v16, v4 +; GFX9-NEXT: v_mul_lo_u32 v8, v13, v5 +; GFX9-NEXT: v_mul_hi_u32 v10, v13, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v11, v19, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v9, v12, v20, vcc ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v10 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v10, v15, v5 -; GFX9-NEXT: v_mul_hi_u32 v4, v15, v4 +; GFX9-NEXT: v_mul_lo_u32 v10, v16, v5 +; GFX9-NEXT: v_mul_hi_u32 v4, v16, v4 ; GFX9-NEXT: v_add_u32_e32 v7, v8, v7 -; GFX9-NEXT: v_mul_hi_u32 v8, v12, v5 -; GFX9-NEXT: v_mul_hi_u32 v5, v15, v5 +; GFX9-NEXT: v_mul_hi_u32 v8, v13, v5 +; GFX9-NEXT: v_mul_hi_u32 v5, v16, v5 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v10, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v8 @@ -1452,8 +1489,8 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_add_u32_e32 v8, v10, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GFX9-NEXT: v_add3_u32 v5, v8, v7, v5 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v12, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v15, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v13, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v16, v5, vcc ; GFX9-NEXT: v_mul_lo_u32 v7, s19, v4 ; GFX9-NEXT: v_mul_lo_u32 v8, s18, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v6, v1, v6, s[0:1] @@ -1477,9 +1514,10 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_cndmask_b32_e64 v7, v0, v9, s[0:1] ; GFX9-NEXT: v_add_u32_e32 v0, v10, v8 ; GFX9-NEXT: v_add3_u32 v8, v0, v1, v12 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX9-NEXT: v_mov_b32_e32 v9, s19 ; GFX9-NEXT: v_mov_b32_e32 v0, v5 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v8, v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v9, s19 ; GFX9-NEXT: v_mov_b32_e32 v5, s7 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s7, v11, v[0:1] ; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s18, v4 @@ -1526,6 +1564,7 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x20 +; GFX10-NEXT: ; meta instruction ; GFX10-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s5 @@ -1546,120 +1585,127 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 ; GFX10-NEXT: v_mul_f32_e32 v2, 0x2f800000, v0 ; GFX10-NEXT: v_mul_f32_e32 v3, 0x2f800000, v1 -; GFX10-NEXT: v_trunc_f32_e32 v4, v2 -; GFX10-NEXT: v_trunc_f32_e32 v5, v3 -; GFX10-NEXT: v_mul_f32_e32 v2, 0xcf800000, v4 -; GFX10-NEXT: v_mul_f32_e32 v3, 0xcf800000, v5 -; GFX10-NEXT: v_cvt_u32_f32_e32 v9, v4 -; GFX10-NEXT: v_cvt_u32_f32_e32 v10, v5 +; GFX10-NEXT: v_trunc_f32_e32 v6, v2 +; GFX10-NEXT: v_trunc_f32_e32 v8, v3 +; GFX10-NEXT: v_mul_f32_e32 v2, 0xcf800000, v6 +; GFX10-NEXT: v_mul_f32_e32 v3, 0xcf800000, v8 +; GFX10-NEXT: v_cvt_u32_f32_e32 v11, v6 +; GFX10-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX10-NEXT: v_cvt_u32_f32_e32 v8, v8 ; GFX10-NEXT: v_add_f32_e32 v0, v2, v0 ; GFX10-NEXT: v_add_f32_e32 v1, v3, v1 -; GFX10-NEXT: v_cvt_u32_f32_e32 v7, v0 -; GFX10-NEXT: v_cvt_u32_f32_e32 v8, v1 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s1, v7, 0 -; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, s3, v8, 0 -; GFX10-NEXT: v_mul_hi_u32 v11, v9, v0 -; GFX10-NEXT: v_mad_u64_u32 v[4:5], s0, s1, v9, v[1:2] -; GFX10-NEXT: v_mad_u64_u32 v[5:6], s0, s3, v10, v[3:4] -; GFX10-NEXT: v_mul_lo_u32 v6, v9, v0 -; GFX10-NEXT: v_mad_u64_u32 v[3:4], s0, s2, v7, v[4:5] -; GFX10-NEXT: v_mul_hi_u32 v4, v7, v0 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s10, v8, v[5:6] -; GFX10-NEXT: v_mul_lo_u32 v1, v10, v2 -; GFX10-NEXT: v_mul_hi_u32 v5, v8, v2 -; GFX10-NEXT: v_mul_hi_u32 v2, v10, v2 -; GFX10-NEXT: v_mul_lo_u32 v12, v7, v3 -; GFX10-NEXT: v_mul_lo_u32 v13, v9, v3 -; GFX10-NEXT: v_mul_hi_u32 v14, v7, v3 -; GFX10-NEXT: v_mul_lo_u32 v15, v8, v0 -; GFX10-NEXT: v_mul_lo_u32 v16, v10, v0 -; GFX10-NEXT: v_mul_hi_u32 v17, v8, v0 -; GFX10-NEXT: v_mul_hi_u32 v3, v9, v3 -; GFX10-NEXT: v_mul_hi_u32 v0, v10, v0 -; GFX10-NEXT: v_add_co_u32 v6, s0, v6, v12 +; GFX10-NEXT: v_cvt_u32_f32_e32 v9, v0 +; GFX10-NEXT: v_cvt_u32_f32_e32 v10, v1 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s1, v9, 0 +; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, s3, v10, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, v1 +; GFX10-NEXT: v_mov_b32_e32 v6, v3 +; GFX10-NEXT: v_mad_u64_u32 v[3:4], s0, s1, v11, v[4:5] +; GFX10-NEXT: v_mad_u64_u32 v[4:5], s0, s3, v8, v[6:7] +; GFX10-NEXT: v_mul_lo_u32 v7, v11, v0 +; GFX10-NEXT: v_mad_u64_u32 v[5:6], s0, s2, v9, v[3:4] +; GFX10-NEXT: v_mul_hi_u32 v3, v9, v0 +; GFX10-NEXT: v_mul_hi_u32 v6, v11, v0 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s10, v10, v[4:5] +; GFX10-NEXT: v_mul_lo_u32 v12, v9, v5 +; GFX10-NEXT: v_mul_lo_u32 v13, v11, v5 +; GFX10-NEXT: v_mul_lo_u32 v1, v8, v2 +; GFX10-NEXT: v_mul_hi_u32 v4, v10, v2 +; GFX10-NEXT: v_mul_hi_u32 v2, v8, v2 +; GFX10-NEXT: v_mul_hi_u32 v14, v9, v5 +; GFX10-NEXT: v_mul_lo_u32 v15, v10, v0 +; GFX10-NEXT: v_mul_lo_u32 v16, v8, v0 +; GFX10-NEXT: v_add_co_u32 v7, s0, v7, v12 ; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v11, s0, v13, v11 +; GFX10-NEXT: v_add_co_u32 v6, s0, v13, v6 ; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v1, s0, v1, v15 ; GFX10-NEXT: v_cndmask_b32_e64 v15, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v2, s0, v16, v2 ; GFX10-NEXT: v_cndmask_b32_e64 v16, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v4, s0, v6, v4 -; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v6, s0, v11, v14 -; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v1, s0, v1, v5 +; GFX10-NEXT: v_add_co_u32 v3, s0, v7, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 +; GFX10-NEXT: v_add_co_u32 v6, s0, v6, v14 +; GFX10-NEXT: v_mul_hi_u32 v17, v10, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s0 +; GFX10-NEXT: v_add_co_u32 v1, s0, v1, v4 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 -; GFX10-NEXT: v_add_nc_u32_e32 v4, v12, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v3, v12, v3 +; GFX10-NEXT: v_mul_hi_u32 v5, v11, v5 +; GFX10-NEXT: v_mul_hi_u32 v0, v8, v0 ; GFX10-NEXT: v_add_co_u32 v2, s0, v2, v17 -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v15, v1 -; GFX10-NEXT: v_add_co_u32 v4, s0, v6, v4 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0 +; GFX10-NEXT: v_add_co_u32 v3, s0, v6, v3 ; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 -; GFX10-NEXT: v_add_nc_u32_e32 v11, v13, v11 ; GFX10-NEXT: v_add_co_u32 v1, s0, v2, v1 -; GFX10-NEXT: v_add_nc_u32_e32 v5, v16, v5 +; GFX10-NEXT: v_add_nc_u32_e32 v7, v13, v7 +; GFX10-NEXT: v_add_nc_u32_e32 v4, v16, v4 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 -; GFX10-NEXT: v_add3_u32 v3, v11, v6, v3 -; GFX10-NEXT: v_add_co_u32 v7, vcc_lo, v7, v4 -; GFX10-NEXT: v_add_co_u32 v8, s0, v8, v1 -; GFX10-NEXT: v_add3_u32 v2, v5, v2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v3, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s11, s1, v7, 0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v10, vcc_lo, v10, v2, s0 -; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, s3, v8, 0 -; GFX10-NEXT: v_mul_hi_u32 v11, v9, v0 -; GFX10-NEXT: v_mad_u64_u32 v[4:5], s0, s1, v9, v[1:2] -; GFX10-NEXT: v_mad_u64_u32 v[5:6], s0, s3, v10, v[3:4] -; GFX10-NEXT: v_mul_lo_u32 v6, v9, v0 -; GFX10-NEXT: v_mad_u64_u32 v[3:4], s0, s2, v7, v[4:5] -; GFX10-NEXT: v_mul_hi_u32 v4, v7, v0 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s10, v8, v[5:6] -; GFX10-NEXT: v_mul_lo_u32 v1, v10, v2 -; GFX10-NEXT: v_mul_hi_u32 v5, v8, v2 -; GFX10-NEXT: v_mul_hi_u32 v2, v10, v2 -; GFX10-NEXT: v_mul_lo_u32 v12, v7, v3 -; GFX10-NEXT: v_mul_lo_u32 v13, v9, v3 -; GFX10-NEXT: v_mul_hi_u32 v14, v7, v3 -; GFX10-NEXT: v_mul_lo_u32 v15, v8, v0 -; GFX10-NEXT: v_mul_lo_u32 v16, v10, v0 -; GFX10-NEXT: v_mul_hi_u32 v17, v8, v0 -; GFX10-NEXT: v_mul_hi_u32 v3, v9, v3 -; GFX10-NEXT: v_mul_hi_u32 v0, v10, v0 -; GFX10-NEXT: v_add_co_u32 v6, s0, v6, v12 +; GFX10-NEXT: v_add_co_u32 v9, vcc_lo, v9, v3 +; GFX10-NEXT: v_add3_u32 v5, v7, v6, v5 +; GFX10-NEXT: v_add_co_u32 v10, s0, v10, v1 +; GFX10-NEXT: v_add3_u32 v2, v4, v2, v0 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s11, s1, v9, 0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v11, v5, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e64 v8, vcc_lo, v8, v2, s0 +; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, s3, v10, 0 +; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX10-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX10-NEXT: v_mov_b32_e32 v4, v1 +; GFX10-NEXT: v_mov_b32_e32 v6, v3 +; GFX10-NEXT: v_mad_u64_u32 v[3:4], s0, s1, v11, v[4:5] +; GFX10-NEXT: v_mad_u64_u32 v[4:5], s0, s3, v8, v[6:7] +; GFX10-NEXT: v_mul_lo_u32 v7, v11, v0 +; GFX10-NEXT: v_mad_u64_u32 v[5:6], s0, s2, v9, v[3:4] +; GFX10-NEXT: v_mul_hi_u32 v3, v9, v0 +; GFX10-NEXT: v_mul_hi_u32 v6, v11, v0 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s10, v10, v[4:5] +; GFX10-NEXT: v_mul_lo_u32 v12, v9, v5 +; GFX10-NEXT: v_mul_lo_u32 v13, v11, v5 +; GFX10-NEXT: v_mul_lo_u32 v1, v8, v2 +; GFX10-NEXT: v_mul_hi_u32 v4, v10, v2 +; GFX10-NEXT: v_mul_hi_u32 v2, v8, v2 +; GFX10-NEXT: v_mul_hi_u32 v14, v9, v5 +; GFX10-NEXT: v_mul_lo_u32 v15, v10, v0 +; GFX10-NEXT: v_mul_lo_u32 v16, v8, v0 +; GFX10-NEXT: v_add_co_u32 v7, s0, v7, v12 ; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v11, s0, v13, v11 +; GFX10-NEXT: v_add_co_u32 v6, s0, v13, v6 ; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v1, s0, v1, v15 ; GFX10-NEXT: v_cndmask_b32_e64 v15, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v2, s0, v16, v2 ; GFX10-NEXT: v_cndmask_b32_e64 v16, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v4, s0, v6, v4 -; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v6, s0, v11, v14 -; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v1, s0, v1, v5 +; GFX10-NEXT: v_add_co_u32 v3, s0, v7, v3 +; GFX10-NEXT: v_mul_hi_u32 v17, v10, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 +; GFX10-NEXT: v_add_co_u32 v6, s0, v6, v14 +; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s0 +; GFX10-NEXT: v_add_co_u32 v1, s0, v1, v4 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 -; GFX10-NEXT: v_add_nc_u32_e32 v4, v12, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v3, v12, v3 ; GFX10-NEXT: v_add_co_u32 v2, s0, v2, v17 -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0 +; GFX10-NEXT: v_mul_hi_u32 v5, v11, v5 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v15, v1 -; GFX10-NEXT: v_add_co_u32 v4, s0, v6, v4 -; GFX10-NEXT: v_add_nc_u32_e32 v11, v13, v11 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0 +; GFX10-NEXT: v_add_co_u32 v3, s0, v6, v3 +; GFX10-NEXT: v_add_nc_u32_e32 v7, v13, v7 ; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 +; GFX10-NEXT: v_mul_hi_u32 v0, v8, v0 ; GFX10-NEXT: v_add_co_u32 v1, s0, v2, v1 -; GFX10-NEXT: v_add_nc_u32_e32 v5, v16, v5 +; GFX10-NEXT: v_add_nc_u32_e32 v4, v16, v4 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 -; GFX10-NEXT: v_add3_u32 v3, v11, v6, v3 -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v7, v4 -; GFX10-NEXT: v_add_co_u32 v1, s0, v8, v1 -; GFX10-NEXT: v_add3_u32 v0, v5, v2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v9, v3, vcc_lo -; GFX10-NEXT: v_mul_lo_u32 v3, s17, v4 -; GFX10-NEXT: v_mul_hi_u32 v5, s16, v4 -; GFX10-NEXT: v_add_co_ci_u32_e64 v0, vcc_lo, v10, v0, s0 +; GFX10-NEXT: v_add3_u32 v5, v7, v6, v5 +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, v9, v3 +; GFX10-NEXT: v_add_co_u32 v1, s0, v10, v1 +; GFX10-NEXT: v_add3_u32 v0, v4, v2, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v11, v5, vcc_lo +; GFX10-NEXT: v_mul_lo_u32 v4, s17, v3 +; GFX10-NEXT: v_mul_hi_u32 v5, s16, v3 +; GFX10-NEXT: v_add_co_ci_u32_e64 v0, vcc_lo, v8, v0, s0 ; GFX10-NEXT: v_mul_lo_u32 v8, s16, v2 -; GFX10-NEXT: v_mul_hi_u32 v4, s17, v4 +; GFX10-NEXT: v_mul_hi_u32 v3, s17, v3 ; GFX10-NEXT: v_mul_lo_u32 v9, s17, v2 ; GFX10-NEXT: v_mul_lo_u32 v6, s19, v1 ; GFX10-NEXT: v_mul_hi_u32 v10, s16, v2 @@ -1670,9 +1716,9 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_mul_lo_u32 v12, s19, v0 ; GFX10-NEXT: v_mul_hi_u32 v13, s18, v0 ; GFX10-NEXT: v_mul_hi_u32 v14, s19, v0 -; GFX10-NEXT: v_add_co_u32 v0, s0, v3, v8 -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v4, s0, v9, v4 +; GFX10-NEXT: v_add_co_u32 v0, s0, v4, v8 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0 +; GFX10-NEXT: v_add_co_u32 v3, s0, v9, v3 ; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v2, s0, v6, v2 ; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 @@ -1680,101 +1726,105 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v0, s0, v0, v5 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX10-NEXT: v_add_co_u32 v4, s0, v4, v10 +; GFX10-NEXT: v_add_co_u32 v3, s0, v3, v10 ; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v2, s0, v2, v7 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v3, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v0, v4, v0 ; GFX10-NEXT: v_add_co_u32 v1, s0, v1, v13 -; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s0 -; GFX10-NEXT: v_add_nc_u32_e32 v2, v6, v2 -; GFX10-NEXT: v_add_nc_u32_e32 v5, v8, v5 -; GFX10-NEXT: v_add_co_u32 v8, s0, v4, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0 +; GFX10-NEXT: v_add_nc_u32_e32 v2, v6, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v7, v8, v5 +; GFX10-NEXT: v_add_co_u32 v8, s0, v3, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v10, s0, v1, v2 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s1, s4, v8, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s0 ; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, s6, v10, 0 -; GFX10-NEXT: v_add_nc_u32_e32 v7, v9, v7 -; GFX10-NEXT: v_add3_u32 v9, v5, v4, v11 +; GFX10-NEXT: v_add_nc_u32_e32 v9, v9, v4 +; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX10-NEXT: v_add3_u32 v11, v7, v6, v11 +; GFX10-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX10-NEXT: v_mov_b32_e32 v13, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, v1 +; GFX10-NEXT: v_add3_u32 v9, v9, v12, v14 +; GFX10-NEXT: v_mov_b32_e32 v6, v3 ; GFX10-NEXT: v_add_co_u32 v12, vcc_lo, v8, 1 -; GFX10-NEXT: v_mov_b32_e32 v11, 0 -; GFX10-NEXT: v_add3_u32 v7, v7, v6, v14 -; GFX10-NEXT: v_mad_u64_u32 v[4:5], s0, s4, v9, v[1:2] -; GFX10-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, 0, v9, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[5:6], s0, s6, v7, v[3:4] -; GFX10-NEXT: v_mad_u64_u32 v[3:4], s0, s5, v8, v[4:5] -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v12, 1 -; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0, v13, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v14, vcc_lo, s16, v0 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s7, v10, v[5:6] -; GFX10-NEXT: v_sub_co_ci_u32_e64 v5, s0, s17, v3, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s4, v14 -; GFX10-NEXT: v_sub_nc_u32_e32 v1, s17, v3 -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, -1, s0 -; GFX10-NEXT: v_sub_co_u32 v15, s0, s18, v2 +; GFX10-NEXT: v_mad_u64_u32 v[3:4], s0, s4, v11, v[4:5] +; GFX10-NEXT: v_mad_u64_u32 v[4:5], s0, s6, v9, v[6:7] +; GFX10-NEXT: v_add_co_ci_u32_e32 v14, vcc_lo, 0, v11, vcc_lo +; GFX10-NEXT: v_mad_u64_u32 v[5:6], s0, s5, v8, v[3:4] +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, v12, 1 +; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0, v14, vcc_lo +; GFX10-NEXT: v_sub_co_u32 v7, vcc_lo, s16, v0 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s7, v10, v[4:5] +; GFX10-NEXT: v_sub_co_ci_u32_e64 v15, s0, s17, v5, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s4, v7 +; GFX10-NEXT: v_sub_nc_u32_e32 v1, s17, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, -1, s0 +; GFX10-NEXT: v_sub_co_u32 v16, s0, s18, v2 ; GFX10-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s5, v1, vcc_lo -; GFX10-NEXT: v_sub_co_ci_u32_e64 v16, s1, s19, v0, s0 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v15 +; GFX10-NEXT: v_sub_co_ci_u32_e64 v17, s1, s19, v0, s0 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v16 ; GFX10-NEXT: v_sub_nc_u32_e32 v0, s19, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v17, vcc_lo, v14, s4 +; GFX10-NEXT: v_sub_co_u32 v5, vcc_lo, v7, s4 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v18, s1, 0, v1, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e64 s1, s5, v5 +; GFX10-NEXT: v_cmp_le_u32_e64 s1, s5, v15 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v23, s0, s7, v0, s0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s5, v18 ; GFX10-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s5, v1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v19, 0, -1, s1 -; GFX10-NEXT: v_cmp_le_u32_e64 s1, s4, v17 +; GFX10-NEXT: v_cmp_le_u32_e64 s1, s4, v5 ; GFX10-NEXT: v_cndmask_b32_e64 v20, 0, -1, s1 ; GFX10-NEXT: v_cmp_le_u32_e64 s1, s5, v18 ; GFX10-NEXT: v_cndmask_b32_e64 v21, 0, -1, s1 -; GFX10-NEXT: v_cmp_le_u32_e64 s1, s7, v16 +; GFX10-NEXT: v_cmp_le_u32_e64 s1, s7, v17 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v21, v20, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s5, v5 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s5, v15 ; GFX10-NEXT: v_cndmask_b32_e64 v22, 0, -1, s1 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v19, v3, s0 -; GFX10-NEXT: v_sub_co_u32 v0, s0, v17, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v19, v4, s0 +; GFX10-NEXT: v_sub_co_u32 v0, s0, v5, s4 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v19, s0, 0, v1, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v12, v4, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v13, v6, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v6, s1, v15, s6 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v17, v0, vcc_lo -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v12, s2, 0, v23, s1 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v12, v3, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v4 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v14, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v0, vcc_lo +; GFX10-NEXT: v_sub_co_u32 v5, s1, v16, s6 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v6, s2, 0, v23, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v8, v1, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v9, v3, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v11, v3, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v18, v19, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s7, v16 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v14, v4, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v3, s0 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s7, v17 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v7, v4, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v22, v2, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v12 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v6 +; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v5 ; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v6 -; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc_lo -; GFX10-NEXT: v_add_co_u32 v13, vcc_lo, v10, 1 -; GFX10-NEXT: v_add_co_ci_u32_e32 v14, vcc_lo, 0, v7, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s7, v12 -; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc_lo -; GFX10-NEXT: v_add_co_u32 v9, vcc_lo, v13, 1 -; GFX10-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, 0, v14, vcc_lo +; GFX10-NEXT: v_add_co_u32 v11, vcc_lo, v10, 1 +; GFX10-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, 0, v9, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s7, v6 +; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc_lo +; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v11, 1 +; GFX10-NEXT: v_add_co_ci_u32_e32 v14, vcc_lo, 0, v12, vcc_lo ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v18, vcc_lo, s7, v23, s1 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 -; GFX10-NEXT: v_sub_co_u32 v8, s1, v6, s6 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v7 +; GFX10-NEXT: v_sub_co_u32 v7, s1, v5, s6 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v18, s1, 0, v18, s1 -; GFX10-NEXT: v_cndmask_b32_e32 v9, v13, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v13, v14, v17, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v8, v11, v8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v11, v12, v14, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 s1, 0, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v8, v12, v18, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v10, v9, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v13, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v15, v6, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v16, v8, s1 -; GFX10-NEXT: global_store_dwordx4 v11, v[0:3], s[12:13] -; GFX10-NEXT: global_store_dwordx4 v11, v[4:7], s[14:15] +; GFX10-NEXT: v_cndmask_b32_e32 v7, v5, v7, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v12, v6, v18, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v5, v15, v3, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v10, v8, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v9, v11, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v16, v7, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v17, v12, s1 +; GFX10-NEXT: global_store_dwordx4 v13, v[0:3], s[12:13] +; GFX10-NEXT: global_store_dwordx4 v13, v[4:7], s[14:15] ; GFX10-NEXT: s_endpgm %div = udiv <2 x i64> %x, %y store <2 x i64> %div, ptr addrspace(1) %out0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll index 1fd139b06417f..4a19d7e99d621 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll @@ -633,10 +633,12 @@ define i32 @v_usubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { ; GFX11-TRUE16-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_lshlrev_b16 v2, 8, v3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3 ; GFX11-TRUE16-NEXT: v_pk_sub_u16 v0, v0, v1 clamp ; GFX11-TRUE16-NEXT: v_pk_lshlrev_b16 v1, 8, v6 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_sub_u16 v1, v2, v1 clamp +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h ; GFX11-TRUE16-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 @@ -878,6 +880,8 @@ define amdgpu_ps i32 @s_usubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s2 ; GFX11-TRUE16-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_sub_u16 v1, s0, s1 clamp +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h ; GFX11-TRUE16-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll index 46b82d3a3d651..9c75967f2d624 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll @@ -226198,8 +226198,8 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_f32_e32 v51, 0x40c00000, v51 ; GFX11-TRUE16-NEXT: v_add3_u32 v37, v39, v48, 0x7fff ; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v49, 16, 1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v22.h ; GFX11-TRUE16-NEXT: v_dual_add_f32 v71, 0x40c00000, v71 :: v_dual_add_f32 v4, 0x40c00000, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v37, v37, v38, vcc_lo ; GFX11-TRUE16-NEXT: v_add3_u32 v38, v39, v49, 0x7fff @@ -226207,22 +226207,21 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v50, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v49, 0x40c00000, v52 :: v_dual_lshlrev_b32 v52, 16, v25 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v34, 16, v21 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v24, v38, v39 :: v_dual_and_b32 v5, 0xffff0000, v5 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v80, 0x40c00000, v80 :: v_dual_add_f32 v5, 0x40c00000, v5 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v24, v38, v39, vcc_lo ; GFX11-TRUE16-NEXT: v_add3_u32 v38, v48, v50, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v50 ; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v51, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 ; GFX11-TRUE16-NEXT: v_bfe_u32 v50, v49, 16, 1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v52, 0x40c00000, v52 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v80, 0x40c00000, v80 :: v_dual_add_f32 v5, 0x40c00000, v5 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v52, 0x40c00000, v52 :: v_dual_lshlrev_b32 v81, 16, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v38, v38, v39, vcc_lo ; GFX11-TRUE16-NEXT: v_add3_u32 v39, v48, v51, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v51 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v51, v51 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v51, 0x40c00000, v53 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v6 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v81, 0x40c00000, v81 :: v_dual_add_f32 v6, 0x40c00000, v6 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v25, v39, v48, vcc_lo ; GFX11-TRUE16-NEXT: v_add3_u32 v39, v50, v49, 0x7fff @@ -226230,86 +226229,82 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_bfe_u32 v50, v52, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v52 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v81, 0x40c00000, v81 :: v_dual_add_f32 v6, 0x40c00000, v6 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v84, 0x40c00000, v84 :: v_dual_lshlrev_b32 v85, 16, v10 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v39, v39, v48, vcc_lo ; GFX11-TRUE16-NEXT: v_add3_u32 v48, v50, v52, 0x7fff ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v53, 16, v26 ; GFX11-TRUE16-NEXT: v_bfe_u32 v50, v51, 16, 1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v84, 0x40c00000, v84 :: v_dual_add_f32 v9, 0x40c00000, v9 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v48, v48, v49, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v53, 0x40c00000, v53 :: v_dual_add_f32 v54, 0x40c00000, v26 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v53, 0x40c00000, v53 ; GFX11-TRUE16-NEXT: v_add3_u32 v49, v50, v51, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v50, 0x400000, v51 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v54, 0x40c00000, v26 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v27 ; GFX11-TRUE16-NEXT: v_bfe_u32 v52, v53, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v51, v51 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, 0x400000, v53 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 16, v48 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v85, 16, v10 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v49, v49, v50, vcc_lo ; GFX11-TRUE16-NEXT: v_add3_u32 v50, v52, v53, 0x7fff ; GFX11-TRUE16-NEXT: v_bfe_u32 v52, v54, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v55, 0x40c00000, v26 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v53, 0x40c00000, v27 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v85, 0x40c00000, v85 :: v_dual_cndmask_b32 v26, v50, v51 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v85, 0x40c00000, v85 :: v_dual_add_f32 v10, 0x40c00000, v10 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v26, v50, v51, vcc_lo ; GFX11-TRUE16-NEXT: v_add3_u32 v50, v52, v54, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, 0x400000, v54 ; GFX11-TRUE16-NEXT: v_bfe_u32 v52, v55, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v54, 0x40c00000, v64 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff0000, v28 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v10, 0x40c00000, v10 :: v_dual_add_f32 v11, 0x40c00000, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v27, v50, v51, vcc_lo ; GFX11-TRUE16-NEXT: v_add3_u32 v50, v52, v55, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, 0x400000, v55 ; GFX11-TRUE16-NEXT: v_bfe_u32 v52, v53, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v55, 0x40c00000, v64 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v27.h -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v28, v50, v51 :: v_dual_and_b32 v13, 0xffff0000, v13 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 0x40c00000, v13 :: v_dual_cndmask_b32 v28, v50, v51 ; GFX11-TRUE16-NEXT: v_add3_u32 v50, v52, v53, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, 0x400000, v53 ; GFX11-TRUE16-NEXT: v_bfe_u32 v52, v54, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53 ; GFX11-TRUE16-NEXT: v_bfe_u32 v53, v55, 16, 1 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v33, 16, v26 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v37.h -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v50, v50, v51, vcc_lo ; GFX11-TRUE16-NEXT: v_add3_u32 v51, v52, v54, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v52, 0x400000, v54 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v64, 16, v29 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v29, v51, v52, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v29, v51, v52 :: v_dual_lshlrev_b32 v64, 16, v29 ; GFX11-TRUE16-NEXT: v_add3_u32 v51, v53, v55, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v52, 0x400000, v55 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v54, 0x40c00000, v65 :: v_dual_lshlrev_b32 v65, 16, v30 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v64, 0x40c00000, v64 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v50.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v51, v51, v52, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_bfe_u32 v55, v54, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_add_f32_e32 v65, 0x40c00000, v65 ; GFX11-TRUE16-NEXT: v_bfe_u32 v53, v64, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v64, v64 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v28 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_add3_u32 v52, v53, v64, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v53, 0x400000, v64 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v64, 0x40c00000, v66 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 16, v31 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v30, v52, v53 :: v_dual_and_b32 v31, 0xffff0000, v31 ; GFX11-TRUE16-NEXT: v_add3_u32 v52, v55, v54, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v53, 0x400000, v54 @@ -226349,21 +226344,18 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_f32_e32 v67, 0x40c00000, v69 ; GFX11-TRUE16-NEXT: v_bfe_u32 v65, v66, 16, 1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v69, 16, v2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v31.h ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v55, v64, vcc_lo ; GFX11-TRUE16-NEXT: v_or_b32_e32 v64, 0x400000, v66 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_add3_u32 v55, v65, v66, 0x7fff ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66 ; GFX11-TRUE16-NEXT: v_bfe_u32 v66, v67, 16, 1 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v69, 0x40c00000, v69 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v55, v55, v64, vcc_lo ; GFX11-TRUE16-NEXT: v_add_f32_e32 v68, 0x40c00000, v68 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v55.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_bfe_u32 v65, v68, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v68, v68 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v1 ; GFX11-TRUE16-NEXT: v_add3_u32 v64, v65, v68, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v65, 0x400000, v68 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v68, 0x40c00000, v70 @@ -226395,12 +226387,10 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v70, v70 ; GFX11-TRUE16-NEXT: v_bfe_u32 v70, v71, 16, 1 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v67, v67, v68, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_add3_u32 v68, v69, v3, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v69, 0x400000, v3 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 16, v67 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v68, v69, vcc_lo ; GFX11-TRUE16-NEXT: v_add3_u32 v68, v70, v71, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v69, 0x400000, v71 @@ -226412,28 +226402,26 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add3_u32 v69, v70, v4, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v70, 0x400000, v4 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 16, v68 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v69, v70, vcc_lo ; GFX11-TRUE16-NEXT: v_add3_u32 v69, v71, v80, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v70, 0x400000, v80 ; GFX11-TRUE16-NEXT: v_bfe_u32 v71, v5, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v80, v80 ; GFX11-TRUE16-NEXT: v_bfe_u32 v80, v81, 16, 1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v4.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v69, v69, v70, vcc_lo ; GFX11-TRUE16-NEXT: v_add3_u32 v70, v71, v5, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v71, 0x400000, v5 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 16, v69 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v70, v71, vcc_lo ; GFX11-TRUE16-NEXT: v_add3_u32 v70, v80, v81, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v71, 0x400000, v81 ; GFX11-TRUE16-NEXT: v_bfe_u32 v80, v6, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v81, v81 ; GFX11-TRUE16-NEXT: v_bfe_u32 v81, v82, 16, 1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v5.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v70, v70, v71, vcc_lo ; GFX11-TRUE16-NEXT: v_add3_u32 v71, v80, v6, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v80, 0x400000, v6 @@ -226481,14 +226469,12 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_bfe_u32 v84, v10, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v85, v85 ; GFX11-TRUE16-NEXT: v_bfe_u32 v85, v86, 16, 1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v9.h -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v81 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v82, v82, v83, vcc_lo ; GFX11-TRUE16-NEXT: v_add3_u32 v83, v84, v10, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v84, 0x400000, v10 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v7, 16, v9 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v10, v83, v84, vcc_lo ; GFX11-TRUE16-NEXT: v_add3_u32 v83, v85, v86, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v84, 0x400000, v86 @@ -226496,72 +226482,50 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v86, v86 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v86, 0x40c00000, v96 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v96, 0x400000, v11 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v10.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_add3_u32 v85, v85, v11, 0x7fff ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v83, v83, v84, vcc_lo ; GFX11-TRUE16-NEXT: v_bfe_u32 v84, v87, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 ; GFX11-TRUE16-NEXT: v_bfe_u32 v99, v86, 16, 1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v82 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_add3_u32 v84, v84, v87, 0x7fff ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v85, v96, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v87, v87 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_add3_u32 v87, v99, v86, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v96, 0x400000, v86 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v6, 16, v10 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v11.h ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v84, v84, v97, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v86, v86 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v83 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v86, v87, v96, vcc_lo ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v96, 16, v15 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v5, 16, v11 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v6.h -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v70 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_dual_add_f32 v96, 0x40c00000, v96 :: v_dual_add_f32 v15, 0x40c00000, v15 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v5, 16, v6 ; GFX11-TRUE16-NEXT: v_bfe_u32 v101, v96, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_bfe_u32 v102, v15, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v113, 0x400000, v15 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v114, 0x400000, v96 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v17, 16, v69 ; GFX11-TRUE16-NEXT: v_add3_u32 v101, v101, v96, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add3_u32 v102, v102, v15, 0x7fff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v66.h -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 16, v0 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v27, 16, v55 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v51.h ; GFX11-TRUE16-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v29 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v31, 16, v66 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_bfe_u32 v98, v12, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v97, 0x400000, v12 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v27, 16, v51 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v32, 16, v50 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v38.h ; GFX11-TRUE16-NEXT: v_add3_u32 v85, v98, v12, 0x7fff ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v98, 16, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v24 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_add_f32_e32 v87, 0x40c00000, v98 ; GFX11-TRUE16-NEXT: v_bfe_u32 v98, v13, 16, 1 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v32, 16, v38 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v34.h -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v20 ; GFX11-TRUE16-NEXT: v_bfe_u32 v99, v87, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_add3_u32 v98, v98, v13, 0x7fff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v103, 0x400000, v87 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v18 ; GFX11-TRUE16-NEXT: v_add3_u32 v99, v99, v87, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; GFX11-TRUE16-NEXT: v_bfe_u32 v100, v14, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v112, 0x400000, v14 @@ -226570,73 +226534,140 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add3_u32 v100, v100, v14, 0x7fff ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v14, v100, v112, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v14.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v15, v102, v113, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v96, v96 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v15.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v100.l, v15.h ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v96, v101, v114, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v87, v87 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 16, v96 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v87, v99, v103, vcc_lo ; GFX11-TRUE16-NEXT: v_or_b32_e32 v99, 0x400000, v13 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v3, 16, v15 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v96.l, v14.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v87 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v100, 16, v15 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v13, v98, v99, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v4, 16, v14 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v13.h +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v96, 16, v14 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v12, v85, v97, vcc_lo +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.l, v13.h ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 16, v86 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v12.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.l, v12.h ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 16, v84 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v3, 16, v13 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v8.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.l, v11.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v83 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v85, 16, v13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, v10.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v82 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v86, 16, v12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.l, v9.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v81 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v84, 16, v11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v8.h ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v80 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v4, 16, v12 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v7.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v83, 16, v10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.l, v7.h ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v71 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v3, 16, v8 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v4, 16, v7 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v22, 16, v68 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v64.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v82, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.l, v6.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v70 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v81, 16, v8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.l, v5.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v69 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v80, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, v4.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v68 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v71, 16, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v3.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v67 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v70, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v69, 16, v4 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v66.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, v55.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 16, v1 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v68, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v64.h ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v2 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v17, 16, v65 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v54.h -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v3, 16, v67 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v22, 16, v64 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v52.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.l, v31.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v69, 16, v55 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v67, 16, v65 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v54.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v66, 16, v64 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v52.h ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 16, v30 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v17, 16, v53 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v49.h -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v22, 16, v52 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v39.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v50.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 16, v28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v51.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 16, v29 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v29, v54, 16, v52 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v27.h +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v65, 16, v50 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v30, v55, 16, v53 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v64, 16, v51 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v49.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v37.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v39.h ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 16, v25 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v17, 16, v48 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v36.h -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v32, 16, v34 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v22, 16, v39 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v33, 16, v37 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v32.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v17.h -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v17, 16, v35 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v33, 16, v36 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v37, 16, v38 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v39, 16, v16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v38.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 16, v24 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v25, v50, 16, v48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v22.h +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v22, v52, 16, v37 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v49, 16, v39 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v23, v51, 16, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v36.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v31, v70, 16, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v34.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v33.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v32.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v17.h +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v26, v66, 16, v26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v53, 16, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v37, 16, v35 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v36, 16, v34 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v38, 16, v33 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v39, 16, v32 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v48, 16, v16 ; GFX11-TRUE16-NEXT: .LBB104_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll index 9b28fd9e7b6fd..9533a8eb8a9bb 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll @@ -18251,7 +18251,6 @@ define <8 x i16> @bitcast_v8bf16_to_v8i16(<8 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add3_u32 v12, v12, v7, 0x7fff ; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; GFX11-TRUE16-NEXT: v_add3_u32 v8, v11, v1, 0x7fff -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v4 @@ -18266,8 +18265,7 @@ define <8 x i16> @bitcast_v8bf16_to_v8i16(<8 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 ; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v7, 0x7fff ; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v4, 16, 1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v9, v11, vcc_lo ; GFX11-TRUE16-NEXT: v_add3_u32 v9, v12, v2, 0x7fff ; GFX11-TRUE16-NEXT: v_add3_u32 v11, v13, v4, 0x7fff @@ -18281,24 +18279,29 @@ define <8 x i16> @bitcast_v8bf16_to_v8i16(<8 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-TRUE16-NEXT: v_add3_u32 v13, v14, v3, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v13, v14, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.h ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v9, v15, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v4 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v3.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.h ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.h -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v0, 16, v3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.h -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v1, 16, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v7, 16, v5 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v4, 16, v6 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v8, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v4, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v7, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v6, 16, v0 ; GFX11-TRUE16-NEXT: .LBB94_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll index 52e125d0d658f..b0664de7e6370 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll @@ -28849,21 +28849,20 @@ define <16 x i16> @bitcast_v16bf16_to_v16i16(<16 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v1, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_dual_add_f32 v12, 0x40c00000, v12 :: v_dual_add_f32 v3, 0x40c00000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add3_u32 v11, v11, v1, 0x7fff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v0, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v0 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add3_u32 v13, v13, v0, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v13, v15, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v1 @@ -28873,8 +28872,7 @@ define <16 x i16> @bitcast_v16bf16_to_v16i16(<16 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v10, 16, 1 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v11, v13, vcc_lo ; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v10 @@ -28885,13 +28883,12 @@ define <16 x i16> @bitcast_v16bf16_to_v16i16(<16 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, 0x400000, v4 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v10, v11, v13, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_add3_u32 v11, v14, v2, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v2 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX11-TRUE16-NEXT: v_add3_u32 v14, v15, v12, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v10 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v11, v13, vcc_lo ; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v3, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 @@ -28949,33 +28946,45 @@ define <16 x i16> @bitcast_v16bf16_to_v16i16(<16 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v23, v24, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v7.h ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v19, v25, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v18 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v6.h ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v16, v20, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v7.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v18 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v4.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v6.h ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v14 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v0, 16, v7 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v5.h ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v11 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v15, v17, vcc_lo -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v1, 16, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.h -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v12 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v2, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v16, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v18, 16, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v14, 16, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v11 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v0.h -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v1, 16, v3 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v0, 16, v4 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v2, 16, v10 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v11, 16, v9 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v12, 16, v8 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v3.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v12 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v2.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v10 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v13, 16, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v1.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v9 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v11, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v12, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v10, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v9, 16, v0 ; GFX11-TRUE16-NEXT: .LBB94_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll index 632b03ca51b81..683e3dc15f133 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll @@ -5455,12 +5455,13 @@ define <2 x i16> @bitcast_v2bf16_to_v2i16(<2 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v2, 16, v0 ; GFX11-TRUE16-NEXT: .LBB50_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll index fd190b23dd8ca..4c137069aae8c 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll @@ -824,17 +824,18 @@ define <3 x i16> @bitcast_v3bf16_to_v3i16(<3 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v7, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.h ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, 0x7fc0, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, 0x7fc0, 16, v1 ; GFX11-TRUE16-NEXT: .LBB4_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll index ede44e738fe00..ac2b77a24b09d 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll @@ -60959,49 +60959,42 @@ define <32 x i16> @bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 ; GFX11-TRUE16-NEXT: v_add3_u32 v19, v20, v22, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, 0x400000, v22 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v16.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v19, v19, v20, vcc_lo ; GFX11-TRUE16-NEXT: v_add3_u32 v20, v21, v3, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v3 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v20, v21, vcc_lo +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v3, v20, v21 :: v_dual_and_b32 v4, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 0x40c00000, v23 :: v_dual_add_f32 v4, 0x40c00000, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_bfe_u32 v22, v23, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v23 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_add3_u32 v20, v22, v23, 0x7fff ; GFX11-TRUE16-NEXT: v_bfe_u32 v22, v4, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v20, v20, v21, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_add3_u32 v21, v22, v4, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v4 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v4, v21, v22 :: v_dual_and_b32 v5, 0xffff0000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_dual_add_f32 v24, 0x40c00000, v24 :: v_dual_add_f32 v5, 0x40c00000, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_bfe_u32 v23, v24, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v24 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_add3_u32 v21, v23, v24, 0x7fff ; GFX11-TRUE16-NEXT: v_bfe_u32 v23, v5, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v21, v21, v22, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_add3_u32 v22, v23, v5, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v5 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v22, v23, vcc_lo +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v5, v22, v23 :: v_dual_and_b32 v6, 0xffff0000, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_dual_add_f32 v25, 0x40c00000, v25 :: v_dual_add_f32 v6, 0x40c00000, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v5.h ; GFX11-TRUE16-NEXT: v_bfe_u32 v24, v25, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v25 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 @@ -61070,45 +61063,38 @@ define <32 x i16> @bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, 0x400000, v10 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 ; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v10, v27, v28 :: v_dual_and_b32 v11, 0xffff0000, v11 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_dual_add_f32 v30, 0x40c00000, v30 :: v_dual_add_f32 v11, 0x40c00000, v11 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v10.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_bfe_u32 v29, v30, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, 0x400000, v30 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v26 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_add3_u32 v27, v29, v30, 0x7fff ; GFX11-TRUE16-NEXT: v_bfe_u32 v29, v11, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v6, 16, v10 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v27, v27, v28, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_add3_u32 v28, v29, v11, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, 0x400000, v11 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 ; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v11, v28, v29 :: v_dual_and_b32 v12, 0xffff0000, v12 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_dual_add_f32 v31, 0x40c00000, v31 :: v_dual_add_f32 v12, 0x40c00000, v12 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v11.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_bfe_u32 v30, v31, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_bfe_u32 v28, v12, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v33, 0x400000, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v27 ; GFX11-TRUE16-NEXT: v_add3_u32 v29, v30, v31, 0x7fff ; GFX11-TRUE16-NEXT: v_add_f32_e32 v30, 0x40c00000, v32 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, 0x400000, v31 ; GFX11-TRUE16-NEXT: v_add3_u32 v28, v28, v12, 0x7fff ; GFX11-TRUE16-NEXT: v_bfe_u32 v31, v13, 16, 1 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v5, 16, v11 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v30, 16, 1 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v29, v29, v32, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_add3_u32 v31, v31, v13, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v6.h -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v22 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v14 @@ -61119,74 +61105,100 @@ define <32 x i16> @bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v13, v31, v35 :: v_dual_add_f32 v32, 0x40c00000, v32 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v33, 0x400000, v30 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v15, 0x40c00000, v15 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v13.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v32, 16, 1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v12.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_bfe_u32 v31, v15, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v15 ; GFX11-TRUE16-NEXT: v_add3_u32 v35, v36, v32, 0x7fff ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 16, v29 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add3_u32 v31, v31, v15, 0x7fff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v5, 16, v6 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v16, 16, v21 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v4, 16, v12 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v7.h ; GFX11-TRUE16-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v23 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v14, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v14 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v4, 16, v7 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v21, 16, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add3_u32 v36, v37, v14, 0x7fff ; GFX11-TRUE16-NEXT: v_add3_u32 v37, v38, v34, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v37, v38, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v34.h ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v15, v31, v39, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, 0x400000, v32 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 16, v15 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v14, v36, v48, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v1, 16, v15 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v14.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v34.h ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v31, v35, v31, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v36, 16, v15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v14.h ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v31 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v28, v28, v33, vcc_lo -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v1, 16, v14 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v13.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v30, 16, v14 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 16, v28 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v9.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v12.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 16, v29 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v11.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v27 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v31, 16, v13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v10.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v26 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v28, 16, v12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v9.h ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v25 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v3, 16, v13 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v8.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v29, 16, v11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v8.h ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v24 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v1, 16, v9 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v4.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v27, 16, v10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v7.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v23 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v26, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v6.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v22 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v25, 16, v8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v5.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v21 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v24, 16, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v4.h ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v2 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v3, 16, v8 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v16, 16, v17 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v1, 16, v4 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v18, 16, v20 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v3, 16, v19 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v23, 16, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v3.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v19 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v22, 16, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v18.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v21, 16, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v16.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v1.h +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v20, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v19, 16, v17 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v18, 16, v16 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v22, 16, v0 ; GFX11-TRUE16-NEXT: .LBB94_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll index ab1f8606cffd7..f9e0157e2dc4e 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll @@ -12846,22 +12846,25 @@ define <4 x i16> @bitcast_v4bf16_to_v4i16(<4 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v7, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.h ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v0, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v2, 16, v3 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v4, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v5, 16, v0 ; GFX11-TRUE16-NEXT: .LBB94_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll index 9f5c9c4c509ed..d6e89167e68e6 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll @@ -8437,7 +8437,7 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v37, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v59, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v58, v11 @@ -8460,8 +8460,8 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v24, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v51, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v33, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v50, s16 ; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 ; SI-NEXT: v_cvt_f16_f32_e32 v34, s18 ; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 @@ -8484,7 +8484,7 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v33 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 @@ -8507,7 +8507,7 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v52 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v0, v33, v0 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 ; SI-NEXT: v_or_b32_e32 v1, v34, v1 ; SI-NEXT: v_or_b32_e32 v3, v62, v3 ; SI-NEXT: v_or_b32_e32 v4, v36, v4 @@ -8516,7 +8516,7 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v7, v48, v7 ; SI-NEXT: v_or_b32_e32 v8, v38, v8 ; SI-NEXT: v_or_b32_e32 v9, v49, v9 -; SI-NEXT: v_or_b32_e32 v10, v50, v10 +; SI-NEXT: v_or_b32_e32 v10, v51, v10 ; SI-NEXT: v_or_b32_e32 v11, v59, v11 ; SI-NEXT: v_or_b32_e32 v12, v57, v12 ; SI-NEXT: v_or_b32_e32 v13, v47, v13 @@ -8531,8 +8531,8 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i ; SI-NEXT: s_cbranch_execnz .LBB19_3 ; SI-NEXT: .LBB19_2: ; %cmp.true ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v62 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 @@ -8561,7 +8561,7 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v51 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 @@ -8739,7 +8739,7 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -8757,14 +8757,14 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v56, v43 ; SI-NEXT: v_mov_b32_e32 v43, v54 ; SI-NEXT: v_mov_b32_e32 v54, v24 +; SI-NEXT: v_mov_b32_e32 v32, v50 ; SI-NEXT: v_mov_b32_e32 v50, v34 ; SI-NEXT: v_mov_b32_e32 v34, v62 ; SI-NEXT: v_mov_b32_e32 v62, v57 ; SI-NEXT: v_mov_b32_e32 v57, v44 ; SI-NEXT: v_mov_b32_e32 v44, v55 ; SI-NEXT: v_mov_b32_e32 v55, v25 -; SI-NEXT: v_mov_b32_e32 v32, v33 -; SI-NEXT: v_mov_b32_e32 v33, v63 +; SI-NEXT: v_mov_b32_e32 v51, v63 ; SI-NEXT: v_mov_b32_e32 v63, v58 ; SI-NEXT: v_mov_b32_e32 v58, v45 ; SI-NEXT: v_mov_b32_e32 v45, v40 @@ -8784,14 +8784,14 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v40, v45 ; SI-NEXT: v_mov_b32_e32 v45, v58 ; SI-NEXT: v_mov_b32_e32 v58, v63 -; SI-NEXT: v_mov_b32_e32 v63, v33 -; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_mov_b32_e32 v63, v51 ; SI-NEXT: v_mov_b32_e32 v25, v55 ; SI-NEXT: v_mov_b32_e32 v55, v44 ; SI-NEXT: v_mov_b32_e32 v44, v57 ; SI-NEXT: v_mov_b32_e32 v57, v62 ; SI-NEXT: v_mov_b32_e32 v62, v34 ; SI-NEXT: v_mov_b32_e32 v34, v50 +; SI-NEXT: v_mov_b32_e32 v50, v32 ; SI-NEXT: v_mov_b32_e32 v24, v54 ; SI-NEXT: v_mov_b32_e32 v54, v43 ; SI-NEXT: v_mov_b32_e32 v43, v56 @@ -8808,7 +8808,7 @@ define inreg <22 x i32> @bitcast_v44f16_to_v22i32_scalar(<44 x half> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v46, v59 ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload @@ -16907,7 +16907,7 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v37, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v59, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v58, v11 @@ -16930,8 +16930,8 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v24, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v51, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v33, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v50, s16 ; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 ; SI-NEXT: v_cvt_f16_f32_e32 v34, s18 ; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 @@ -16954,7 +16954,7 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a, ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v33 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 @@ -16977,7 +16977,7 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v52 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v0, v33, v0 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 ; SI-NEXT: v_or_b32_e32 v1, v34, v1 ; SI-NEXT: v_or_b32_e32 v3, v62, v3 ; SI-NEXT: v_or_b32_e32 v4, v36, v4 @@ -16986,7 +16986,7 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a, ; SI-NEXT: v_or_b32_e32 v7, v48, v7 ; SI-NEXT: v_or_b32_e32 v8, v38, v8 ; SI-NEXT: v_or_b32_e32 v9, v49, v9 -; SI-NEXT: v_or_b32_e32 v10, v50, v10 +; SI-NEXT: v_or_b32_e32 v10, v51, v10 ; SI-NEXT: v_or_b32_e32 v11, v59, v11 ; SI-NEXT: v_or_b32_e32 v12, v57, v12 ; SI-NEXT: v_or_b32_e32 v13, v47, v13 @@ -17001,8 +17001,8 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a, ; SI-NEXT: s_cbranch_execnz .LBB35_3 ; SI-NEXT: .LBB35_2: ; %cmp.true ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v62 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 @@ -17031,7 +17031,7 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v51 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 @@ -17209,7 +17209,7 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a, ; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -17227,14 +17227,14 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a, ; SI-NEXT: v_mov_b32_e32 v56, v43 ; SI-NEXT: v_mov_b32_e32 v43, v54 ; SI-NEXT: v_mov_b32_e32 v54, v24 +; SI-NEXT: v_mov_b32_e32 v32, v50 ; SI-NEXT: v_mov_b32_e32 v50, v34 ; SI-NEXT: v_mov_b32_e32 v34, v62 ; SI-NEXT: v_mov_b32_e32 v62, v57 ; SI-NEXT: v_mov_b32_e32 v57, v44 ; SI-NEXT: v_mov_b32_e32 v44, v55 ; SI-NEXT: v_mov_b32_e32 v55, v25 -; SI-NEXT: v_mov_b32_e32 v32, v33 -; SI-NEXT: v_mov_b32_e32 v33, v63 +; SI-NEXT: v_mov_b32_e32 v51, v63 ; SI-NEXT: v_mov_b32_e32 v63, v58 ; SI-NEXT: v_mov_b32_e32 v58, v45 ; SI-NEXT: v_mov_b32_e32 v45, v40 @@ -17254,14 +17254,14 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a, ; SI-NEXT: v_mov_b32_e32 v40, v45 ; SI-NEXT: v_mov_b32_e32 v45, v58 ; SI-NEXT: v_mov_b32_e32 v58, v63 -; SI-NEXT: v_mov_b32_e32 v63, v33 -; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_mov_b32_e32 v63, v51 ; SI-NEXT: v_mov_b32_e32 v25, v55 ; SI-NEXT: v_mov_b32_e32 v55, v44 ; SI-NEXT: v_mov_b32_e32 v44, v57 ; SI-NEXT: v_mov_b32_e32 v57, v62 ; SI-NEXT: v_mov_b32_e32 v62, v34 ; SI-NEXT: v_mov_b32_e32 v34, v50 +; SI-NEXT: v_mov_b32_e32 v50, v32 ; SI-NEXT: v_mov_b32_e32 v24, v54 ; SI-NEXT: v_mov_b32_e32 v54, v43 ; SI-NEXT: v_mov_b32_e32 v43, v56 @@ -17278,7 +17278,7 @@ define inreg <22 x float> @bitcast_v44f16_to_v22f32_scalar(<44 x half> inreg %a, ; SI-NEXT: v_mov_b32_e32 v46, v59 ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload @@ -24693,7 +24693,7 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v37, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v59, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v58, v11 @@ -24716,8 +24716,8 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v24, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v51, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v33, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v50, s16 ; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 ; SI-NEXT: v_cvt_f16_f32_e32 v34, s18 ; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 @@ -24740,7 +24740,7 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v33 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 @@ -24763,7 +24763,7 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v52 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v0, v33, v0 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 ; SI-NEXT: v_or_b32_e32 v1, v34, v1 ; SI-NEXT: v_or_b32_e32 v3, v62, v3 ; SI-NEXT: v_or_b32_e32 v4, v36, v4 @@ -24772,7 +24772,7 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v7, v48, v7 ; SI-NEXT: v_or_b32_e32 v8, v38, v8 ; SI-NEXT: v_or_b32_e32 v9, v49, v9 -; SI-NEXT: v_or_b32_e32 v10, v50, v10 +; SI-NEXT: v_or_b32_e32 v10, v51, v10 ; SI-NEXT: v_or_b32_e32 v11, v59, v11 ; SI-NEXT: v_or_b32_e32 v12, v57, v12 ; SI-NEXT: v_or_b32_e32 v13, v47, v13 @@ -24787,8 +24787,8 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i ; SI-NEXT: s_cbranch_execnz .LBB47_3 ; SI-NEXT: .LBB47_2: ; %cmp.true ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v62 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 @@ -24817,7 +24817,7 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v51 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 @@ -24995,7 +24995,7 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -25013,14 +25013,14 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v56, v43 ; SI-NEXT: v_mov_b32_e32 v43, v54 ; SI-NEXT: v_mov_b32_e32 v54, v24 +; SI-NEXT: v_mov_b32_e32 v32, v50 ; SI-NEXT: v_mov_b32_e32 v50, v34 ; SI-NEXT: v_mov_b32_e32 v34, v62 ; SI-NEXT: v_mov_b32_e32 v62, v57 ; SI-NEXT: v_mov_b32_e32 v57, v44 ; SI-NEXT: v_mov_b32_e32 v44, v55 ; SI-NEXT: v_mov_b32_e32 v55, v25 -; SI-NEXT: v_mov_b32_e32 v32, v33 -; SI-NEXT: v_mov_b32_e32 v33, v63 +; SI-NEXT: v_mov_b32_e32 v51, v63 ; SI-NEXT: v_mov_b32_e32 v63, v58 ; SI-NEXT: v_mov_b32_e32 v58, v45 ; SI-NEXT: v_mov_b32_e32 v45, v40 @@ -25040,14 +25040,14 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v40, v45 ; SI-NEXT: v_mov_b32_e32 v45, v58 ; SI-NEXT: v_mov_b32_e32 v58, v63 -; SI-NEXT: v_mov_b32_e32 v63, v33 -; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_mov_b32_e32 v63, v51 ; SI-NEXT: v_mov_b32_e32 v25, v55 ; SI-NEXT: v_mov_b32_e32 v55, v44 ; SI-NEXT: v_mov_b32_e32 v44, v57 ; SI-NEXT: v_mov_b32_e32 v57, v62 ; SI-NEXT: v_mov_b32_e32 v62, v34 ; SI-NEXT: v_mov_b32_e32 v34, v50 +; SI-NEXT: v_mov_b32_e32 v50, v32 ; SI-NEXT: v_mov_b32_e32 v24, v54 ; SI-NEXT: v_mov_b32_e32 v54, v43 ; SI-NEXT: v_mov_b32_e32 v43, v56 @@ -25064,7 +25064,7 @@ define inreg <11 x i64> @bitcast_v44f16_to_v11i64_scalar(<44 x half> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v46, v59 ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload @@ -31528,7 +31528,7 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v37, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v59, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v58, v11 @@ -31551,8 +31551,8 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v24, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v51, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v33, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v50, s16 ; SI-NEXT: v_cvt_f16_f32_e32 v1, s19 ; SI-NEXT: v_cvt_f16_f32_e32 v34, s18 ; SI-NEXT: v_cvt_f16_f32_e32 v2, s21 @@ -31575,7 +31575,7 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v33 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 @@ -31598,7 +31598,7 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v52 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v22 -; SI-NEXT: v_or_b32_e32 v0, v33, v0 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 ; SI-NEXT: v_or_b32_e32 v1, v34, v1 ; SI-NEXT: v_or_b32_e32 v3, v62, v3 ; SI-NEXT: v_or_b32_e32 v4, v36, v4 @@ -31607,7 +31607,7 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a ; SI-NEXT: v_or_b32_e32 v7, v48, v7 ; SI-NEXT: v_or_b32_e32 v8, v38, v8 ; SI-NEXT: v_or_b32_e32 v9, v49, v9 -; SI-NEXT: v_or_b32_e32 v10, v50, v10 +; SI-NEXT: v_or_b32_e32 v10, v51, v10 ; SI-NEXT: v_or_b32_e32 v11, v59, v11 ; SI-NEXT: v_or_b32_e32 v12, v57, v12 ; SI-NEXT: v_or_b32_e32 v13, v47, v13 @@ -31622,8 +31622,8 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a ; SI-NEXT: s_cbranch_execnz .LBB55_3 ; SI-NEXT: .LBB55_2: ; %cmp.true ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v62 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 @@ -31652,7 +31652,7 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v51 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 @@ -31830,7 +31830,7 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a ; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -31848,14 +31848,14 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a ; SI-NEXT: v_mov_b32_e32 v56, v43 ; SI-NEXT: v_mov_b32_e32 v43, v54 ; SI-NEXT: v_mov_b32_e32 v54, v24 +; SI-NEXT: v_mov_b32_e32 v32, v50 ; SI-NEXT: v_mov_b32_e32 v50, v34 ; SI-NEXT: v_mov_b32_e32 v34, v62 ; SI-NEXT: v_mov_b32_e32 v62, v57 ; SI-NEXT: v_mov_b32_e32 v57, v44 ; SI-NEXT: v_mov_b32_e32 v44, v55 ; SI-NEXT: v_mov_b32_e32 v55, v25 -; SI-NEXT: v_mov_b32_e32 v32, v33 -; SI-NEXT: v_mov_b32_e32 v33, v63 +; SI-NEXT: v_mov_b32_e32 v51, v63 ; SI-NEXT: v_mov_b32_e32 v63, v58 ; SI-NEXT: v_mov_b32_e32 v58, v45 ; SI-NEXT: v_mov_b32_e32 v45, v40 @@ -31875,14 +31875,14 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a ; SI-NEXT: v_mov_b32_e32 v40, v45 ; SI-NEXT: v_mov_b32_e32 v45, v58 ; SI-NEXT: v_mov_b32_e32 v58, v63 -; SI-NEXT: v_mov_b32_e32 v63, v33 -; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_mov_b32_e32 v63, v51 ; SI-NEXT: v_mov_b32_e32 v25, v55 ; SI-NEXT: v_mov_b32_e32 v55, v44 ; SI-NEXT: v_mov_b32_e32 v44, v57 ; SI-NEXT: v_mov_b32_e32 v57, v62 ; SI-NEXT: v_mov_b32_e32 v62, v34 ; SI-NEXT: v_mov_b32_e32 v34, v50 +; SI-NEXT: v_mov_b32_e32 v50, v32 ; SI-NEXT: v_mov_b32_e32 v24, v54 ; SI-NEXT: v_mov_b32_e32 v54, v43 ; SI-NEXT: v_mov_b32_e32 v43, v56 @@ -31899,7 +31899,7 @@ define inreg <11 x double> @bitcast_v44f16_to_v11f64_scalar(<44 x half> inreg %a ; SI-NEXT: v_mov_b32_e32 v46, v59 ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll index 4f46875076809..e8e20f75964a1 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll @@ -11977,7 +11977,6 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; SI-NEXT: v_mov_b32_e32 v61, v40 -; SI-NEXT: v_mov_b32_e32 v40, v44 ; SI-NEXT: s_cbranch_vccnz .LBB19_5 ; SI-NEXT: ; %bb.4: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(5) @@ -12012,7 +12011,7 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v44 ; SI-NEXT: v_mov_b32_e32 v55, v42 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v61 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v43 @@ -23964,7 +23963,6 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; SI-NEXT: v_mov_b32_e32 v61, v40 -; SI-NEXT: v_mov_b32_e32 v40, v44 ; SI-NEXT: s_cbranch_vccnz .LBB35_5 ; SI-NEXT: ; %bb.4: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(5) @@ -23999,7 +23997,7 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v44 ; SI-NEXT: v_mov_b32_e32 v55, v42 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v61 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v43 @@ -35116,7 +35114,6 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; SI-NEXT: v_mov_b32_e32 v61, v40 -; SI-NEXT: v_mov_b32_e32 v40, v44 ; SI-NEXT: s_cbranch_vccnz .LBB47_5 ; SI-NEXT: ; %bb.4: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(5) @@ -35151,7 +35148,7 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v44 ; SI-NEXT: v_mov_b32_e32 v55, v42 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v61 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v43 @@ -45101,7 +45098,6 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; SI-NEXT: v_mov_b32_e32 v61, v40 -; SI-NEXT: v_mov_b32_e32 v40, v44 ; SI-NEXT: s_cbranch_vccnz .LBB55_5 ; SI-NEXT: ; %bb.4: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(5) @@ -45136,7 +45132,7 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v44 ; SI-NEXT: v_mov_b32_e32 v55, v42 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v61 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v43 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll index 4aded5da3668a..5770e58992e09 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll @@ -12409,14 +12409,14 @@ define <6 x i16> @bitcast_v6bf16_to_v6i16(<6 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v9 ; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v3, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v6 ; GFX11-TRUE16-NEXT: v_add3_u32 v6, v10, v9, 0x7fff ; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v1, 16, 1 ; GFX11-TRUE16-NEXT: v_add3_u32 v9, v11, v3, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc_lo ; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v5, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_add3_u32 v7, v10, v1, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v3 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 @@ -12426,26 +12426,30 @@ define <6 x i16> @bitcast_v6bf16_to_v6i16(<6 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v3, v9, v10 :: v_dual_add_f32 v0, 0x40c00000, v0 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v11, v12, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v0, 0x7fff ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v7, v13, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v8, v2, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v5.h -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.h -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v0, 16, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v1, 16, v3 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v5, 16, v4 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v5, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v6, 16, v0 ; GFX11-TRUE16-NEXT: .LBB52_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index 97df2a0dbd44b..91cb00bffb04b 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -2026,7 +2026,9 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX9-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s6, v2, v[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[2:3], s7, v2, v[1:2] +; GFX9-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[2:3], s7, v2, v[3:4] ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_nop 2 @@ -2068,8 +2070,10 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1064-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s6, v2, s[2:3] -; GFX1064-NEXT: v_mad_u64_u32 v[1:2], s[2:3], s7, v2, v[1:2] +; GFX1064-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-NEXT: v_mad_u64_u32 v[1:2], s[2:3], s7, v2, v[3:4] ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -2109,9 +2113,11 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, s6, v2, s[2:3] ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032-NEXT: v_mad_u64_u32 v[1:2], s2, s7, v2, v[1:2] +; GFX1032-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-NEXT: v_mad_u64_u32 v[1:2], s2, s7, v2, v[3:4] ; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm @@ -2151,12 +2157,15 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1164-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, s4, v2, s[2:3] ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_mov_b32 s2, -1 -; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s5, v2, v[1:2] -; GFX1164-NEXT: v_mov_b32_e32 v1, v3 +; GFX1164-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-NEXT: v_mad_u64_u32 v[5:6], null, s5, v2, v[3:4] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_mov_b32_e32 v1, v5 ; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 ; GFX1164-NEXT: s_endpgm ; @@ -2194,12 +2203,15 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1132-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, s4, v2, s[2:3] ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_mov_b32 s2, -1 -; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s5, v2, v[1:2] -; GFX1132-NEXT: v_mov_b32_e32 v1, v3 +; GFX1132-NEXT: v_mov_b32_e32 v3, v1 +; GFX1132-NEXT: v_mad_u64_u32 v[5:6], null, s5, v2, v[3:4] +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_mov_b32_e32 v1, v5 ; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 ; GFX1132-NEXT: s_endpgm ; @@ -2236,11 +2248,13 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: s_wait_kmcnt 0x0 ; GFX1264-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1264-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1264-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1264-NEXT: v_mad_co_u64_u32 v[0:1], null, s4, v2, s[2:3] ; GFX1264-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1264-NEXT: s_mov_b32 s2, -1 -; GFX1264-NEXT: v_mad_co_u64_u32 v[1:2], null, s5, v2, v[1:2] +; GFX1264-NEXT: v_mov_b32_e32 v3, v1 +; GFX1264-NEXT: v_mad_co_u64_u32 v[1:2], null, s5, v2, v[3:4] ; GFX1264-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null ; GFX1264-NEXT: s_endpgm ; @@ -2276,11 +2290,13 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1232-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1232-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1232-NEXT: v_mad_co_u64_u32 v[0:1], null, s4, v2, s[2:3] ; GFX1232-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1232-NEXT: s_mov_b32 s2, -1 -; GFX1232-NEXT: v_mad_co_u64_u32 v[1:2], null, s5, v2, v[1:2] +; GFX1232-NEXT: v_mov_b32_e32 v3, v1 +; GFX1232-NEXT: v_mad_co_u64_u32 v[1:2], null, s5, v2, v[3:4] ; GFX1232-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null ; GFX1232-NEXT: s_endpgm entry: @@ -6224,13 +6240,14 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX9-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s8, v4, 0 -; GFX9-NEXT: v_readfirstlane_b32 s7, v0 ; GFX9-NEXT: v_readfirstlane_b32 s6, v1 +; GFX9-NEXT: v_readfirstlane_b32 s7, v0 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s9, v4, v[0:1] ; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s7, v2 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v3, v0, vcc ; GFX9-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 @@ -6288,7 +6305,9 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1064-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mad_u64_u32 v[2:3], s[2:3], s8, v4, 0 -; GFX1064-NEXT: v_mad_u64_u32 v[3:4], s[2:3], s9, v4, v[3:4] +; GFX1064-NEXT: ; implicit-def: $vgpr5_vgpr6 +; GFX1064-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-NEXT: v_mad_u64_u32 v[3:4], s[2:3], s9, v4, v[5:6] ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v2 @@ -6349,8 +6368,10 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s10 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mad_u64_u32 v[2:3], s2, s8, v4, 0 +; GFX1032-NEXT: ; implicit-def: $vgpr5_vgpr6 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 -; GFX1032-NEXT: v_mad_u64_u32 v[3:4], s2, s9, v4, v[3:4] +; GFX1032-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-NEXT: v_mad_u64_u32 v[3:4], s2, s9, v4, v[5:6] ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v2 ; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v3, vcc_lo @@ -6413,17 +6434,19 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1164-NEXT: ; %bb.3: ; %Flow ; GFX1164-NEXT: s_or_b64 exec, exec, s[12:13] ; GFX1164-NEXT: .LBB10_4: ; %Flow4 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_3) ; GFX1164-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_mad_u64_u32 v[2:3], null, s8, v4, 0 +; GFX1164-NEXT: ; implicit-def: $vgpr5_vgpr6 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 -; GFX1164-NEXT: s_waitcnt_depctr 0xfff -; GFX1164-NEXT: v_mad_u64_u32 v[5:6], null, s9, v4, v[3:4] +; GFX1164-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v2 ; GFX1164-NEXT: s_mov_b32 s2, -1 -; GFX1164-NEXT: v_sub_co_ci_u32_e64 v1, null, s3, v5, vcc +; GFX1164-NEXT: v_mad_u64_u32 v[7:8], null, s9, v4, v[5:6] +; GFX1164-NEXT: v_sub_co_ci_u32_e64 v1, null, s3, v7, vcc ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 ; GFX1164-NEXT: s_endpgm @@ -6483,13 +6506,15 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s10 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_mad_u64_u32 v[2:3], null, s8, v4, 0 +; GFX1132-NEXT: ; implicit-def: $vgpr5_vgpr6 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 -; GFX1132-NEXT: v_mad_u64_u32 v[5:6], null, s9, v4, v[3:4] -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1132-NEXT: v_mov_b32_e32 v5, v3 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v2 ; GFX1132-NEXT: s_mov_b32 s2, -1 -; GFX1132-NEXT: v_sub_co_ci_u32_e64 v1, null, s3, v5, vcc_lo +; GFX1132-NEXT: v_mad_u64_u32 v[7:8], null, s9, v4, v[5:6] +; GFX1132-NEXT: v_sub_co_ci_u32_e64 v1, null, s3, v7, vcc_lo ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 ; GFX1132-NEXT: s_endpgm @@ -6526,12 +6551,15 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX1264-NEXT: s_wait_kmcnt 0x0 ; GFX1264-NEXT: v_mad_co_u64_u32 v[3:4], null, s4, v2, 0 +; GFX1264-NEXT: ; implicit-def: $vgpr5_vgpr6 ; GFX1264-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1264-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1264-NEXT: v_mad_co_u64_u32 v[4:5], null, s5, v2, v[4:5] +; GFX1264-NEXT: v_mov_b32_e32 v5, v4 ; GFX1264-NEXT: v_sub_co_u32 v0, vcc, s2, v3 ; GFX1264-NEXT: s_mov_b32 s2, -1 +; GFX1264-NEXT: v_mad_co_u64_u32 v[4:5], null, s5, v2, v[5:6] +; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1264-NEXT: v_sub_co_ci_u32_e64 v1, null, s3, v4, vcc ; GFX1264-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1264-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null @@ -6568,12 +6596,15 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: v_mad_co_u64_u32 v[3:4], null, s4, v2, 0 +; GFX1232-NEXT: ; implicit-def: $vgpr5_vgpr6 ; GFX1232-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1232-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1232-NEXT: v_mad_co_u64_u32 v[4:5], null, s5, v2, v[4:5] +; GFX1232-NEXT: v_mov_b32_e32 v5, v4 ; GFX1232-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v3 ; GFX1232-NEXT: s_mov_b32 s2, -1 +; GFX1232-NEXT: v_mad_co_u64_u32 v[4:5], null, s5, v2, v[5:6] +; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1232-NEXT: v_sub_co_ci_u32_e64 v1, null, s3, v4, vcc_lo ; GFX1232-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1232-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index 23c5f4f5506f3..4638f641ffbc5 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -1794,9 +1794,11 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s2, v2, v[0:1] +; GFX9-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[2:3], s3, v2, v[1:2] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[2:3], s3, v2, v[3:4] ; GFX9-NEXT: s_mov_b32 s4, s0 ; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -1830,9 +1832,11 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: v_readfirstlane_b32 s5, v1 ; GFX1064-NEXT: v_readfirstlane_b32 s4, v0 +; GFX1064-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s2, v2, s[4:5] -; GFX1064-NEXT: v_mad_u64_u32 v[1:2], s[2:3], s3, v2, v[1:2] +; GFX1064-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-NEXT: v_mad_u64_u32 v[1:2], s[2:3], s3, v2, v[3:4] ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -1865,9 +1869,11 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1032-NEXT: v_readfirstlane_b32 s5, v1 ; GFX1032-NEXT: v_readfirstlane_b32 s4, v0 +; GFX1032-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, s2, v2, s[4:5] -; GFX1032-NEXT: v_mad_u64_u32 v[1:2], s2, s3, v2, v[1:2] +; GFX1032-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-NEXT: v_mad_u64_u32 v[1:2], s2, s3, v2, v[3:4] ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -1901,13 +1907,15 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1164-NEXT: v_readfirstlane_b32 s5, v1 ; GFX1164-NEXT: v_readfirstlane_b32 s4, v0 +; GFX1164-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, s2, v2, s[4:5] ; GFX1164-NEXT: s_mov_b32 s2, -1 -; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s3, v2, v[1:2] +; GFX1164-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-NEXT: v_mad_u64_u32 v[5:6], null, s3, v2, v[3:4] ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1164-NEXT: v_mov_b32_e32 v1, v3 +; GFX1164-NEXT: v_mov_b32_e32 v1, v5 ; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 ; GFX1164-NEXT: s_endpgm ; @@ -1938,13 +1946,15 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1132-NEXT: v_readfirstlane_b32 s5, v1 ; GFX1132-NEXT: v_readfirstlane_b32 s4, v0 +; GFX1132-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, s2, v2, s[4:5] ; GFX1132-NEXT: s_mov_b32 s2, -1 -; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s3, v2, v[1:2] +; GFX1132-NEXT: v_mov_b32_e32 v3, v1 +; GFX1132-NEXT: v_mad_u64_u32 v[5:6], null, s3, v2, v[3:4] ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1132-NEXT: v_mov_b32_e32 v1, v3 +; GFX1132-NEXT: v_mov_b32_e32 v1, v5 ; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 ; GFX1132-NEXT: s_endpgm entry: @@ -5212,11 +5222,12 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[8:9], s2, v2, 0 +; GFX9-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-NEXT: v_readfirstlane_b32 s8, v0 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: s_mov_b32 s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_mov_b32 s5, s1 -; GFX9-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v2, v[0:1] ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s8, v3 @@ -5254,8 +5265,10 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0 +; GFX1064-NEXT: ; implicit-def: $vgpr5_vgpr6 ; GFX1064-NEXT: v_readfirstlane_b32 s4, v1 -; GFX1064-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s3, v2, v[4:5] +; GFX1064-NEXT: v_mov_b32_e32 v5, v4 +; GFX1064-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s3, v2, v[5:6] ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v3 @@ -5291,8 +5304,10 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mad_u64_u32 v[3:4], s2, s2, v2, 0 +; GFX1032-NEXT: ; implicit-def: $vgpr5_vgpr6 ; GFX1032-NEXT: v_readfirstlane_b32 s4, v1 -; GFX1032-NEXT: v_mad_u64_u32 v[4:5], s2, s3, v2, v[4:5] +; GFX1032-NEXT: v_mov_b32_e32 v5, v4 +; GFX1032-NEXT: v_mad_u64_u32 v[4:5], s2, s3, v2, v[5:6] ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v3 @@ -5329,14 +5344,16 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s2, v2, 0 +; GFX1164-NEXT: ; implicit-def: $vgpr5_vgpr6 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: v_readfirstlane_b32 s4, v1 -; GFX1164-NEXT: s_waitcnt_depctr 0xfff -; GFX1164-NEXT: v_mad_u64_u32 v[5:6], null, s3, v2, v[4:5] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1164-NEXT: v_mov_b32_e32 v5, v4 ; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v3 -; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_mov_b32 s2, -1 -; GFX1164-NEXT: v_sub_co_ci_u32_e64 v1, null, s4, v5, vcc +; GFX1164-NEXT: v_mad_u64_u32 v[7:8], null, s3, v2, v[5:6] +; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: v_sub_co_ci_u32_e64 v1, null, s4, v7, vcc ; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 ; GFX1164-NEXT: s_endpgm ; @@ -5367,14 +5384,16 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive) ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s2, v2, 0 +; GFX1132-NEXT: ; implicit-def: $vgpr5_vgpr6 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: v_readfirstlane_b32 s4, v1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1132-NEXT: v_mad_u64_u32 v[5:6], null, s3, v2, v[4:5] +; GFX1132-NEXT: v_mov_b32_e32 v5, v4 ; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v3 -; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_mov_b32 s2, -1 -; GFX1132-NEXT: v_sub_co_ci_u32_e64 v1, null, s4, v5, vcc_lo +; GFX1132-NEXT: v_mad_u64_u32 v[7:8], null, s3, v2, v[5:6] +; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: v_sub_co_ci_u32_e64 v1, null, s4, v7, vcc_lo ; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 ; GFX1132-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index 44c719f3635c8..4db3233aa79f0 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -10053,10 +10053,11 @@ define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX11TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX11TRUE16-NEXT: ; implicit-def: $vgpr1 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.h ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -14733,10 +14734,11 @@ define <3 x bfloat> @v_fsub_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX11TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX11TRUE16-NEXT: ; implicit-def: $vgpr1 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.h ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -15545,10 +15547,11 @@ define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX11TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX11TRUE16-NEXT: ; implicit-def: $vgpr1 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.h ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -20539,10 +20542,11 @@ define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX11TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX11TRUE16-NEXT: ; implicit-def: $vgpr1 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.h ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -25020,10 +25024,11 @@ define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX11TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX11TRUE16-NEXT: ; implicit-def: $vgpr1 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.h ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -34766,17 +34771,19 @@ define bfloat @v_sitofp_i16_to_bf16(i16 %x) { ; ; GFX11TRUE16-LABEL: v_sitofp_i16_to_bf16: ; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: ; implicit-def: $vgpr1 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_bfe_i32 v0, v1, 0, 16 ; GFX11TRUE16-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -35085,9 +35092,10 @@ define <3 x bfloat> @v_sitofp_v3i16_to_v3bf16(<3 x i16> %x) { ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX11TRUE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc_lo +; GFX11TRUE16-NEXT: ; implicit-def: $vgpr1 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.h ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h @@ -35701,27 +35709,28 @@ define <3 x bfloat> @v_sitofp_v3i32_to_v3bf16(<3 x i32> %x) { ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11TRUE16-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX11TRUE16-NEXT: v_cvt_f32_i32_e32 v2, v2 -; GFX11TRUE16-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX11TRUE16-NEXT: v_cvt_f32_i32_e32 v3, v1 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11TRUE16-NEXT: v_bfe_u32 v4, v2, 16, 1 ; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX11TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2 -; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff -; GFX11TRUE16-NEXT: v_bfe_u32 v6, v1, 16, 1 +; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX11TRUE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff -; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1 +; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v7, v1, v7, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11TRUE16-NEXT: v_add3_u32 v6, v6, v1, 0x7fff +; GFX11TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11TRUE16-NEXT: ; implicit-def: $vgpr1 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo -; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v7.h ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11FAKE16-LABEL: v_sitofp_v3i32_to_v3bf16: @@ -36853,9 +36862,10 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) { ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX11TRUE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc_lo +; GFX11TRUE16-NEXT: ; implicit-def: $vgpr1 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v4.h ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.h @@ -37901,10 +37911,11 @@ define <3 x bfloat> @v_uitofp_v3i16_to_v3bf16(<3 x i16> %x) { ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc_lo +; GFX11TRUE16-NEXT: ; implicit-def: $vgpr1 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v4.h ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.h ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -38524,27 +38535,28 @@ define <3 x bfloat> @v_uitofp_v3i32_to_v3bf16(<3 x i32> %x) { ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v2, v2 -; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v3, v1 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11TRUE16-NEXT: v_bfe_u32 v4, v2, 16, 1 ; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX11TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2 -; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff -; GFX11TRUE16-NEXT: v_bfe_u32 v6, v1, 16, 1 +; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 ; GFX11TRUE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff -; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1 +; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v7, v1, v7, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11TRUE16-NEXT: v_add3_u32 v6, v6, v1, 0x7fff +; GFX11TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11TRUE16-NEXT: ; implicit-def: $vgpr1 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo -; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v7.h ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11FAKE16-LABEL: v_uitofp_v3i32_to_v3bf16: @@ -39472,22 +39484,24 @@ define <3 x bfloat> @v_uitofp_v3i64_to_v3bf16(<3 x i64> %x) { ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11TRUE16-NEXT: v_ldexp_f32 v2, v2, v5 ; GFX11TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 ; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX11TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff ; GFX11TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX11TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1 ; GFX11TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff -; GFX11TRUE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc_lo +; GFX11TRUE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX11TRUE16-NEXT: ; implicit-def: $vgpr1 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v4.h +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.h ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -46402,10 +46416,11 @@ define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v0, v7, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.h ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h +; GFX11TRUE16-NEXT: ; implicit-def: $vgpr1 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.h ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11FAKE16-LABEL: v_fma_v3bf16: @@ -47350,10 +47365,11 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v0, v7, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.h ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h +; GFX11TRUE16-NEXT: ; implicit-def: $vgpr1 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.h ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11FAKE16-LABEL: v_fmuladd_v3bf16: diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll index 53b2542cf9a7e..3c76bb613d21f 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll @@ -2381,18 +2381,20 @@ define <12 x i8> @load_v12i8(ptr addrspace(8) inreg %buf) { ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: buffer_load_dwordx3 v[0:2], off, s[16:19], 0 +; SDAG-NEXT: ; implicit-def: $vgpr11_vgpr12 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v8, v2 -; SDAG-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; SDAG-NEXT: v_mov_b32_e32 v11, v2 ; SDAG-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] ; SDAG-NEXT: v_lshrrev_b32_e32 v14, 8, v0 ; SDAG-NEXT: v_lshrrev_b32_e32 v13, 16, v0 -; SDAG-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] +; SDAG-NEXT: v_lshrrev_b64 v[11:12], 24, v[11:12] ; SDAG-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; SDAG-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; SDAG-NEXT: v_lshrrev_b32_e32 v9, 8, v2 ; SDAG-NEXT: v_lshrrev_b32_e32 v10, 16, v2 ; SDAG-NEXT: v_mov_b32_e32 v4, v1 +; SDAG-NEXT: v_mov_b32_e32 v8, v2 ; SDAG-NEXT: v_mov_b32_e32 v1, v14 ; SDAG-NEXT: v_mov_b32_e32 v2, v13 ; SDAG-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll b/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll index 861621bd92af1..fa5b94fe6cf28 100644 --- a/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll +++ b/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll @@ -48,10 +48,11 @@ define void @undef_lo_v2i16(i16 %arg0) { ; ; GFX11-TRUE16-LABEL: undef_lo_v2i16: ; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l ; GFX11-TRUE16-NEXT: ;;#ASMSTART -; GFX11-TRUE16-NEXT: ; use v0 +; GFX11-TRUE16-NEXT: ; use v1 ; GFX11-TRUE16-NEXT: ;;#ASMEND ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] %undef.lo = insertelement <2 x i16> poison, i16 %arg0, i32 1 @@ -99,10 +100,11 @@ define void @undef_lo_v2f16(half %arg0) { ; ; GFX11-TRUE16-LABEL: undef_lo_v2f16: ; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l ; GFX11-TRUE16-NEXT: ;;#ASMSTART -; GFX11-TRUE16-NEXT: ; use v0 +; GFX11-TRUE16-NEXT: ; use v1 ; GFX11-TRUE16-NEXT: ;;#ASMEND ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] %undef.lo = insertelement <2 x half> poison, half %arg0, i32 1 @@ -157,10 +159,11 @@ define void @undef_lo_op_v2f16(half %arg0) { ; ; GFX11-TRUE16-LABEL: undef_lo_op_v2f16: ; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, v0, 1.0 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, v1, 1.0 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: ;;#ASMSTART ; GFX11-TRUE16-NEXT: ; use v0 ; GFX11-TRUE16-NEXT: ;;#ASMEND @@ -240,10 +243,11 @@ define void @undef_lo_op_v2i16(i16 %arg0) { ; ; GFX11-TRUE16-SDAG-LABEL: undef_lo_op_v2i16: ; GFX11-TRUE16-SDAG: ; %bb.0: +; GFX11-TRUE16-SDAG-NEXT: ; implicit-def: $vgpr1 ; GFX11-TRUE16-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-SDAG-NEXT: v_mov_b16_e32 v0.h, v0.l +; GFX11-TRUE16-SDAG-NEXT: v_mov_b16_e32 v1.h, v0.l ; GFX11-TRUE16-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-SDAG-NEXT: v_pk_add_u16 v0, 0x63, v0 op_sel_hi:[0,1] +; GFX11-TRUE16-SDAG-NEXT: v_pk_add_u16 v0, 0x63, v1 op_sel_hi:[0,1] ; GFX11-TRUE16-SDAG-NEXT: ;;#ASMSTART ; GFX11-TRUE16-SDAG-NEXT: ; use v0 ; GFX11-TRUE16-SDAG-NEXT: ;;#ASMEND @@ -251,10 +255,11 @@ define void @undef_lo_op_v2i16(i16 %arg0) { ; ; GFX11-TRUE16-GISEL-LABEL: undef_lo_op_v2i16: ; GFX11-TRUE16-GISEL: ; %bb.0: +; GFX11-TRUE16-GISEL-NEXT: ; implicit-def: $vgpr1 ; GFX11-TRUE16-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-GISEL-NEXT: v_mov_b16_e32 v0.h, v0.l +; GFX11-TRUE16-GISEL-NEXT: v_mov_b16_e32 v1.h, v0.l ; GFX11-TRUE16-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-GISEL-NEXT: v_pk_add_u16 v0, 0x630063, v0 +; GFX11-TRUE16-GISEL-NEXT: v_pk_add_u16 v0, 0x630063, v1 ; GFX11-TRUE16-GISEL-NEXT: ;;#ASMSTART ; GFX11-TRUE16-GISEL-NEXT: ; use v0 ; GFX11-TRUE16-GISEL-NEXT: ;;#ASMEND @@ -306,10 +311,11 @@ define void @undef_lo3_v4i16(i16 %arg0) { ; ; GFX11-TRUE16-LABEL: undef_lo3_v4i16: ; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l ; GFX11-TRUE16-NEXT: ;;#ASMSTART -; GFX11-TRUE16-NEXT: ; use v[0:1] +; GFX11-TRUE16-NEXT: ; use v[1:2] ; GFX11-TRUE16-NEXT: ;;#ASMEND ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] %undef.lo = insertelement <4 x i16> poison, i16 %arg0, i32 1 @@ -358,10 +364,11 @@ define void @undef_lo3_v4f16(half %arg0) { ; ; GFX11-TRUE16-LABEL: undef_lo3_v4f16: ; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l ; GFX11-TRUE16-NEXT: ;;#ASMSTART -; GFX11-TRUE16-NEXT: ; use v[0:1] +; GFX11-TRUE16-NEXT: ; use v[1:2] ; GFX11-TRUE16-NEXT: ;;#ASMEND ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] %undef.lo = insertelement <4 x half> poison, half %arg0, i32 1 @@ -412,6 +419,7 @@ define void @undef_lo2_v4i16(<2 x i16> %arg0) { ; ; GFX11-TRUE16-SDAG-LABEL: undef_lo2_v4i16: ; GFX11-TRUE16-SDAG: ; %bb.0: +; GFX11-TRUE16-SDAG-NEXT: ; implicit-def: $vgpr1 ; GFX11-TRUE16-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-SDAG-NEXT: v_mov_b16_e32 v1.l, v0.h ; GFX11-TRUE16-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -515,13 +523,31 @@ define void @undef_hi_v2i16(i16 %arg0) { ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: undef_hi_v2i16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ;;#ASMSTART -; GFX11-NEXT: ; use v0 -; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-LABEL: undef_hi_v2i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: ;;#ASMSTART +; GFX11-FAKE16-NEXT: ; use v0 +; GFX11-FAKE16-NEXT: ;;#ASMEND +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-SDAG-LABEL: undef_hi_v2i16: +; GFX11-TRUE16-SDAG: ; %bb.0: +; GFX11-TRUE16-SDAG-NEXT: ; implicit-def: $vgpr1 +; GFX11-TRUE16-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-SDAG-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX11-TRUE16-SDAG-NEXT: ;;#ASMSTART +; GFX11-TRUE16-SDAG-NEXT: ; use v1 +; GFX11-TRUE16-SDAG-NEXT: ;;#ASMEND +; GFX11-TRUE16-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-GISEL-LABEL: undef_hi_v2i16: +; GFX11-TRUE16-GISEL: ; %bb.0: +; GFX11-TRUE16-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-GISEL-NEXT: ;;#ASMSTART +; GFX11-TRUE16-GISEL-NEXT: ; use v0 +; GFX11-TRUE16-GISEL-NEXT: ;;#ASMEND +; GFX11-TRUE16-GISEL-NEXT: s_setpc_b64 s[30:31] %undef.hi = insertelement <2 x i16> poison, i16 %arg0, i32 0 call void asm sideeffect "; use $0", "v"(<2 x i16> %undef.hi); ret void @@ -553,13 +579,31 @@ define void @undef_hi_v2f16(half %arg0) { ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: undef_hi_v2f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ;;#ASMSTART -; GFX11-NEXT: ; use v0 -; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-LABEL: undef_hi_v2f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: ;;#ASMSTART +; GFX11-FAKE16-NEXT: ; use v0 +; GFX11-FAKE16-NEXT: ;;#ASMEND +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-SDAG-LABEL: undef_hi_v2f16: +; GFX11-TRUE16-SDAG: ; %bb.0: +; GFX11-TRUE16-SDAG-NEXT: ; implicit-def: $vgpr1 +; GFX11-TRUE16-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-SDAG-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX11-TRUE16-SDAG-NEXT: ;;#ASMSTART +; GFX11-TRUE16-SDAG-NEXT: ; use v1 +; GFX11-TRUE16-SDAG-NEXT: ;;#ASMEND +; GFX11-TRUE16-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-GISEL-LABEL: undef_hi_v2f16: +; GFX11-TRUE16-GISEL: ; %bb.0: +; GFX11-TRUE16-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-GISEL-NEXT: ;;#ASMSTART +; GFX11-TRUE16-GISEL-NEXT: ; use v0 +; GFX11-TRUE16-GISEL-NEXT: ;;#ASMEND +; GFX11-TRUE16-GISEL-NEXT: s_setpc_b64 s[30:31] %undef.hi = insertelement <2 x half> poison, half %arg0, i32 0 call void asm sideeffect "; use $0", "v"(<2 x half> %undef.hi); ret void @@ -598,14 +642,35 @@ define void @undef_hi_op_v2f16(half %arg0) { ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: undef_hi_op_v2f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_add_f16 v0, v0, 1.0 op_sel_hi:[1,0] -; GFX11-NEXT: ;;#ASMSTART -; GFX11-NEXT: ; use v0 -; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-LABEL: undef_hi_op_v2f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, v0, 1.0 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: ;;#ASMSTART +; GFX11-FAKE16-NEXT: ; use v0 +; GFX11-FAKE16-NEXT: ;;#ASMEND +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-SDAG-LABEL: undef_hi_op_v2f16: +; GFX11-TRUE16-SDAG: ; %bb.0: +; GFX11-TRUE16-SDAG-NEXT: ; implicit-def: $vgpr1 +; GFX11-TRUE16-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-SDAG-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX11-TRUE16-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-SDAG-NEXT: v_pk_add_f16 v0, v1, 1.0 op_sel_hi:[1,0] +; GFX11-TRUE16-SDAG-NEXT: ;;#ASMSTART +; GFX11-TRUE16-SDAG-NEXT: ; use v0 +; GFX11-TRUE16-SDAG-NEXT: ;;#ASMEND +; GFX11-TRUE16-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-GISEL-LABEL: undef_hi_op_v2f16: +; GFX11-TRUE16-GISEL: ; %bb.0: +; GFX11-TRUE16-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-GISEL-NEXT: v_pk_add_f16 v0, v0, 1.0 op_sel_hi:[1,0] +; GFX11-TRUE16-GISEL-NEXT: ;;#ASMSTART +; GFX11-TRUE16-GISEL-NEXT: ; use v0 +; GFX11-TRUE16-GISEL-NEXT: ;;#ASMEND +; GFX11-TRUE16-GISEL-NEXT: s_setpc_b64 s[30:31] %undef.hi = insertelement <2 x half> poison, half %arg0, i32 0 %op = fadd <2 x half> %undef.hi, call void asm sideeffect "; use $0", "v"(<2 x half> %op); @@ -674,8 +739,11 @@ define void @undef_hi_op_v2i16(i16 %arg0) { ; ; GFX11-TRUE16-SDAG-LABEL: undef_hi_op_v2i16: ; GFX11-TRUE16-SDAG: ; %bb.0: +; GFX11-TRUE16-SDAG-NEXT: ; implicit-def: $vgpr1 ; GFX11-TRUE16-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-SDAG-NEXT: v_pk_add_u16 v0, 0x63, v0 op_sel_hi:[0,1] +; GFX11-TRUE16-SDAG-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX11-TRUE16-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-SDAG-NEXT: v_pk_add_u16 v0, 0x63, v1 op_sel_hi:[0,1] ; GFX11-TRUE16-SDAG-NEXT: ;;#ASMSTART ; GFX11-TRUE16-SDAG-NEXT: ; use v0 ; GFX11-TRUE16-SDAG-NEXT: ;;#ASMEND @@ -722,13 +790,31 @@ define void @undef_hi3_v4i16(i16 %arg0) { ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: undef_hi3_v4i16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ;;#ASMSTART -; GFX11-NEXT: ; use v[0:1] -; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-LABEL: undef_hi3_v4i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: ;;#ASMSTART +; GFX11-FAKE16-NEXT: ; use v[0:1] +; GFX11-FAKE16-NEXT: ;;#ASMEND +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-SDAG-LABEL: undef_hi3_v4i16: +; GFX11-TRUE16-SDAG: ; %bb.0: +; GFX11-TRUE16-SDAG-NEXT: ; implicit-def: $vgpr1 +; GFX11-TRUE16-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-SDAG-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX11-TRUE16-SDAG-NEXT: ;;#ASMSTART +; GFX11-TRUE16-SDAG-NEXT: ; use v[1:2] +; GFX11-TRUE16-SDAG-NEXT: ;;#ASMEND +; GFX11-TRUE16-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-GISEL-LABEL: undef_hi3_v4i16: +; GFX11-TRUE16-GISEL: ; %bb.0: +; GFX11-TRUE16-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-GISEL-NEXT: ;;#ASMSTART +; GFX11-TRUE16-GISEL-NEXT: ; use v[0:1] +; GFX11-TRUE16-GISEL-NEXT: ;;#ASMEND +; GFX11-TRUE16-GISEL-NEXT: s_setpc_b64 s[30:31] %undef.hi = insertelement <4 x i16> poison, i16 %arg0, i32 0 call void asm sideeffect "; use $0", "v"(<4 x i16> %undef.hi); ret void @@ -761,13 +847,31 @@ define void @undef_hi3_v4f16(half %arg0) { ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: undef_hi3_v4f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ;;#ASMSTART -; GFX11-NEXT: ; use v[0:1] -; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-LABEL: undef_hi3_v4f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: ;;#ASMSTART +; GFX11-FAKE16-NEXT: ; use v[0:1] +; GFX11-FAKE16-NEXT: ;;#ASMEND +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-SDAG-LABEL: undef_hi3_v4f16: +; GFX11-TRUE16-SDAG: ; %bb.0: +; GFX11-TRUE16-SDAG-NEXT: ; implicit-def: $vgpr1 +; GFX11-TRUE16-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-SDAG-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX11-TRUE16-SDAG-NEXT: ;;#ASMSTART +; GFX11-TRUE16-SDAG-NEXT: ; use v[1:2] +; GFX11-TRUE16-SDAG-NEXT: ;;#ASMEND +; GFX11-TRUE16-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-GISEL-LABEL: undef_hi3_v4f16: +; GFX11-TRUE16-GISEL: ; %bb.0: +; GFX11-TRUE16-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-GISEL-NEXT: ;;#ASMSTART +; GFX11-TRUE16-GISEL-NEXT: ; use v[0:1] +; GFX11-TRUE16-GISEL-NEXT: ;;#ASMEND +; GFX11-TRUE16-GISEL-NEXT: s_setpc_b64 s[30:31] %undef.hi = insertelement <4 x half> poison, half %arg0, i32 0 call void asm sideeffect "; use $0", "v"(<4 x half> %undef.hi); ret void diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll index 26f204f29f5a4..b80e31c120c58 100644 --- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -265,11 +265,20 @@ define float @v_uitofp_i8_to_f32(i8 %arg0) nounwind { ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_uitofp_i8_to_f32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_uitofp_i8_to_f32: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cvt_f32_ubyte0_e32 v0, v1 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_uitofp_i8_to_f32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cvt = uitofp i8 %arg0 to float ret float %cvt } @@ -301,6 +310,7 @@ define <2 x float> @v_uitofp_v2i8_to_v2f32(i16 %arg0) nounwind { ; ; GFX11-TRUE16-LABEL: v_uitofp_v2i8_to_v2f32: ; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) diff --git a/llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll b/llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll index 54cbc25043db3..0a848171ba032 100644 --- a/llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll +++ b/llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll @@ -27,9 +27,11 @@ define noundef i64 @srem64_3(i64 noundef %i) { ; GFX9-NEXT: v_add3_u32 v3, v6, v3, v5 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 31, v3 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v3, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v3, vcc ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, 3, 0 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, 3, v[3:4] +; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, 3, v[4:5] ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -57,10 +59,10 @@ define noundef i64 @srem64_3(i64 noundef %i) { ; GFX942-NEXT: v_lshrrev_b32_e32 v2, 31, v5 ; GFX942-NEXT: v_lshl_add_u64 v[2:3], v[4:5], 0, v[2:3] ; GFX942-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v2, 3, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v3, 3, v[2:3] +; GFX942-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX942-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v4 -; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v6, v5 +; GFX942-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v3, 3, v[6:7] ; GFX942-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc ; GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ -82,10 +84,12 @@ define noundef i64 @srem64_3(i64 noundef %i) { ; GFX1030-NEXT: v_add3_u32 v3, v4, v3, v5 ; GFX1030-NEXT: v_lshrrev_b32_e32 v4, 31, v3 ; GFX1030-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 -; GFX1030-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v3, vcc_lo +; GFX1030-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v3, vcc_lo +; GFX1030-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, v2, 3, 0 -; GFX1030-NEXT: v_mad_u64_u32 v[3:4], null, v4, 3, v[3:4] +; GFX1030-NEXT: v_mov_b32_e32 v4, v3 ; GFX1030-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2 +; GFX1030-NEXT: v_mad_u64_u32 v[3:4], null, v6, 3, v[4:5] ; GFX1030-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v3, vcc_lo ; GFX1030-NEXT: s_setpc_b64 s[30:31] entry: @@ -114,9 +118,11 @@ define noundef i64 @srem64_6(i64 noundef %i) { ; GFX9-NEXT: v_add3_u32 v3, v6, v3, v5 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 31, v3 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v3, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v3, vcc ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, 3, 0 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, 3, v[3:4] +; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, 3, v[4:5] ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -144,10 +150,10 @@ define noundef i64 @srem64_6(i64 noundef %i) { ; GFX942-NEXT: v_lshrrev_b32_e32 v2, 31, v5 ; GFX942-NEXT: v_lshl_add_u64 v[2:3], v[4:5], 0, v[2:3] ; GFX942-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v2, 3, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v3, 3, v[2:3] +; GFX942-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX942-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v4 -; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v6, v5 +; GFX942-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v3, 3, v[6:7] ; GFX942-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc ; GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ -169,10 +175,12 @@ define noundef i64 @srem64_6(i64 noundef %i) { ; GFX1030-NEXT: v_add3_u32 v3, v4, v3, v5 ; GFX1030-NEXT: v_lshrrev_b32_e32 v4, 31, v3 ; GFX1030-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 -; GFX1030-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v3, vcc_lo +; GFX1030-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v3, vcc_lo +; GFX1030-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, v2, 3, 0 -; GFX1030-NEXT: v_mad_u64_u32 v[3:4], null, v4, 3, v[3:4] +; GFX1030-NEXT: v_mov_b32_e32 v4, v3 ; GFX1030-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2 +; GFX1030-NEXT: v_mad_u64_u32 v[3:4], null, v6, 3, v[4:5] ; GFX1030-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v3, vcc_lo ; GFX1030-NEXT: s_setpc_b64 s[30:31] entry: @@ -188,6 +196,7 @@ define noundef i64 @urem64_3(i64 noundef %i) { ; GFX9-NEXT: v_mul_hi_u32 v2, v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_mov_b32 s6, 0xaaaaaaaa +; GFX9-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, s4, v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, v4 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, s6, v[2:3] @@ -196,9 +205,9 @@ define noundef i64 @urem64_3(i64 noundef %i) { ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, s6, v[2:3] ; GFX9-NEXT: v_alignbit_b32 v2, v3, v2, 1 ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, 3, 0 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 1, v3 -; GFX9-NEXT: v_mov_b32_e32 v2, v5 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, 3, v[2:3] +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 1, v3 +; GFX9-NEXT: v_mov_b32_e32 v6, v5 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, 3, v[6:7] ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v4 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -220,9 +229,10 @@ define noundef i64 @urem64_3(i64 noundef %i) { ; GFX942-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v1, s2, v[2:3] ; GFX942-NEXT: v_alignbit_b32 v2, v3, v2, 1 ; GFX942-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v2, 3, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_lshrrev_b32_e32 v3, 1, v3 -; GFX942-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v3, 3, v[2:3] +; GFX942-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX942-NEXT: v_lshrrev_b32_e32 v2, 1, v3 +; GFX942-NEXT: v_mov_b32_e32 v6, v5 +; GFX942-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v2, 3, v[6:7] ; GFX942-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v4 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc @@ -233,6 +243,7 @@ define noundef i64 @urem64_3(i64 noundef %i) { ; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1030-NEXT: v_mul_hi_u32 v2, 0xaaaaaaab, v0 ; GFX1030-NEXT: v_mov_b32_e32 v3, 0 +; GFX1030-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX1030-NEXT: v_mad_u64_u32 v[4:5], null, 0xaaaaaaab, v1, v[2:3] ; GFX1030-NEXT: v_mov_b32_e32 v2, v4 ; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, 0xaaaaaaaa, v0, v[2:3] @@ -240,11 +251,11 @@ define noundef i64 @urem64_3(i64 noundef %i) { ; GFX1030-NEXT: v_add_co_ci_u32_e64 v3, null, 0, 0, s4 ; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, 0xaaaaaaaa, v1, v[2:3] ; GFX1030-NEXT: v_alignbit_b32 v2, v3, v2, 1 -; GFX1030-NEXT: v_lshrrev_b32_e32 v3, 1, v3 ; GFX1030-NEXT: v_mad_u64_u32 v[4:5], null, v2, 3, 0 -; GFX1030-NEXT: v_mov_b32_e32 v2, v5 +; GFX1030-NEXT: v_lshrrev_b32_e32 v2, 1, v3 +; GFX1030-NEXT: v_mov_b32_e32 v6, v5 ; GFX1030-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v4 -; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, v3, 3, v[2:3] +; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, v2, 3, v[6:7] ; GFX1030-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v2, vcc_lo ; GFX1030-NEXT: s_setpc_b64 s[30:31] entry: @@ -260,6 +271,7 @@ define noundef i64 @urem64_6(i64 noundef %i) { ; GFX9-NEXT: v_mul_hi_u32 v2, v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_mov_b32 s6, 0xaaaaaaaa +; GFX9-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, s4, v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, v4 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, s6, v[2:3] @@ -268,9 +280,9 @@ define noundef i64 @urem64_6(i64 noundef %i) { ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, s6, v[2:3] ; GFX9-NEXT: v_alignbit_b32 v2, v3, v2, 2 ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, 6, 0 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 2, v3 -; GFX9-NEXT: v_mov_b32_e32 v2, v5 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, 6, v[2:3] +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 2, v3 +; GFX9-NEXT: v_mov_b32_e32 v6, v5 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, 6, v[6:7] ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v4 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -292,9 +304,10 @@ define noundef i64 @urem64_6(i64 noundef %i) { ; GFX942-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v1, s2, v[2:3] ; GFX942-NEXT: v_alignbit_b32 v2, v3, v2, 2 ; GFX942-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v2, 6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_lshrrev_b32_e32 v3, 2, v3 -; GFX942-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v3, 6, v[2:3] +; GFX942-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX942-NEXT: v_lshrrev_b32_e32 v2, 2, v3 +; GFX942-NEXT: v_mov_b32_e32 v6, v5 +; GFX942-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v2, 6, v[6:7] ; GFX942-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v4 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc @@ -305,6 +318,7 @@ define noundef i64 @urem64_6(i64 noundef %i) { ; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1030-NEXT: v_mul_hi_u32 v2, 0xaaaaaaab, v0 ; GFX1030-NEXT: v_mov_b32_e32 v3, 0 +; GFX1030-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX1030-NEXT: v_mad_u64_u32 v[4:5], null, 0xaaaaaaab, v1, v[2:3] ; GFX1030-NEXT: v_mov_b32_e32 v2, v4 ; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, 0xaaaaaaaa, v0, v[2:3] @@ -312,11 +326,11 @@ define noundef i64 @urem64_6(i64 noundef %i) { ; GFX1030-NEXT: v_add_co_ci_u32_e64 v3, null, 0, 0, s4 ; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, 0xaaaaaaaa, v1, v[2:3] ; GFX1030-NEXT: v_alignbit_b32 v2, v3, v2, 2 -; GFX1030-NEXT: v_lshrrev_b32_e32 v3, 2, v3 ; GFX1030-NEXT: v_mad_u64_u32 v[4:5], null, v2, 6, 0 -; GFX1030-NEXT: v_mov_b32_e32 v2, v5 +; GFX1030-NEXT: v_lshrrev_b32_e32 v2, 2, v3 +; GFX1030-NEXT: v_mov_b32_e32 v6, v5 ; GFX1030-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v4 -; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, v3, 6, v[2:3] +; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, v2, 6, v[6:7] ; GFX1030-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v2, vcc_lo ; GFX1030-NEXT: s_setpc_b64 s[30:31] entry: @@ -1035,9 +1049,11 @@ define noundef i64 @srem64_i32max(i64 noundef %i) { ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 31, v3 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, s6, 0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, s6, v[3:4] +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v5, vcc +; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, s6, v[4:5] ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1070,10 +1086,10 @@ define noundef i64 @srem64_i32max(i64 noundef %i) { ; GFX942-NEXT: v_lshl_add_u64 v[2:3], v[6:7], 0, v[4:5] ; GFX942-NEXT: s_brev_b32 s2, -2 ; GFX942-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v2, s2, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v3, s2, v[2:3] +; GFX942-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX942-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v4 -; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v6, v5 +; GFX942-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v3, s2, v[6:7] ; GFX942-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc ; GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ -1102,10 +1118,12 @@ define noundef i64 @srem64_i32max(i64 noundef %i) { ; GFX1030-NEXT: v_ashrrev_i64 v[4:5], 30, v[2:3] ; GFX1030-NEXT: v_lshrrev_b32_e32 v2, 31, v3 ; GFX1030-NEXT: v_add_co_u32 v2, vcc_lo, v4, v2 -; GFX1030-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v5, vcc_lo +; GFX1030-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v5, vcc_lo +; GFX1030-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, 0x7fffffff, v2, 0 -; GFX1030-NEXT: v_mad_u64_u32 v[3:4], null, 0x7fffffff, v4, v[3:4] +; GFX1030-NEXT: v_mov_b32_e32 v4, v3 ; GFX1030-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2 +; GFX1030-NEXT: v_mad_u64_u32 v[3:4], null, 0x7fffffff, v6, v[4:5] ; GFX1030-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v3, vcc_lo ; GFX1030-NEXT: s_setpc_b64 s[30:31] entry: @@ -1221,11 +1239,13 @@ define noundef i64 @urem64_i32max(i64 noundef %i) { ; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v1, v3, vcc ; GFX9-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5] ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v5, v3, vcc -; GFX9-NEXT: v_alignbit_b32 v2, v4, v2, 30 +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v5, v3, vcc +; GFX9-NEXT: v_alignbit_b32 v2, v6, v2, 30 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, s6, 0 -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 30, v4 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, s6, v[3:4] +; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 30, v6 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v3, s6, v[4:5] ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1251,9 +1271,10 @@ define noundef i64 @urem64_i32max(i64 noundef %i) { ; GFX942-NEXT: v_lshl_add_u64 v[2:3], v[4:5], 0, v[2:3] ; GFX942-NEXT: v_alignbit_b32 v2, v3, v2, 30 ; GFX942-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v2, s2, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_lshrrev_b32_e32 v3, 30, v3 -; GFX942-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v3, s2, v[2:3] +; GFX942-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX942-NEXT: v_lshrrev_b32_e32 v2, 30, v3 +; GFX942-NEXT: v_mov_b32_e32 v6, v5 +; GFX942-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v2, s2, v[6:7] ; GFX942-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v4 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc @@ -1274,12 +1295,14 @@ define noundef i64 @urem64_i32max(i64 noundef %i) { ; GFX1030-NEXT: v_sub_co_ci_u32_e64 v5, null, v1, v3, vcc_lo ; GFX1030-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5] ; GFX1030-NEXT: v_add_co_u32 v2, vcc_lo, v4, v2 -; GFX1030-NEXT: v_add_co_ci_u32_e64 v4, null, v5, v3, vcc_lo -; GFX1030-NEXT: v_alignbit_b32 v2, v4, v2, 30 -; GFX1030-NEXT: v_lshrrev_b32_e32 v4, 30, v4 +; GFX1030-NEXT: v_add_co_ci_u32_e64 v6, null, v5, v3, vcc_lo +; GFX1030-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1030-NEXT: v_alignbit_b32 v2, v6, v2, 30 ; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, 0x7fffffff, v2, 0 -; GFX1030-NEXT: v_mad_u64_u32 v[3:4], null, 0x7fffffff, v4, v[3:4] +; GFX1030-NEXT: v_mov_b32_e32 v4, v3 +; GFX1030-NEXT: v_lshrrev_b32_e32 v3, 30, v6 ; GFX1030-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2 +; GFX1030-NEXT: v_mad_u64_u32 v[3:4], null, 0x7fffffff, v3, v[4:5] ; GFX1030-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v3, vcc_lo ; GFX1030-NEXT: s_setpc_b64 s[30:31] entry: diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll index 6c8207a4b1396..98a2af36856f7 100644 --- a/llvm/test/CodeGen/AMDGPU/div_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll @@ -1159,8 +1159,8 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: v_subb_co_u32_e32 v9, vcc, v1, v16, vcc ; GFX9-G-NEXT: v_ashrrev_i32_e32 v17, 31, v7 ; GFX9-G-NEXT: v_xor_b32_e32 v3, v16, v3 -; GFX9-G-NEXT: v_subb_co_u32_e32 v10, vcc, v2, v16, vcc -; GFX9-G-NEXT: v_subb_co_u32_e32 v11, vcc, v3, v16, vcc +; GFX9-G-NEXT: v_subb_co_u32_e32 v12, vcc, v2, v16, vcc +; GFX9-G-NEXT: v_subb_co_u32_e32 v13, vcc, v3, v16, vcc ; GFX9-G-NEXT: v_xor_b32_e32 v0, v17, v4 ; GFX9-G-NEXT: v_xor_b32_e32 v1, v17, v5 ; GFX9-G-NEXT: v_sub_co_u32_e32 v18, vcc, v0, v17 @@ -1172,8 +1172,8 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: v_or_b32_e32 v0, v18, v4 ; GFX9-G-NEXT: v_or_b32_e32 v1, v19, v5 ; GFX9-G-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GFX9-G-NEXT: v_or_b32_e32 v0, v8, v10 -; GFX9-G-NEXT: v_or_b32_e32 v1, v9, v11 +; GFX9-G-NEXT: v_or_b32_e32 v0, v8, v12 +; GFX9-G-NEXT: v_or_b32_e32 v1, v9, v13 ; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] ; GFX9-G-NEXT: v_ffbh_u32_e32 v1, v18 ; GFX9-G-NEXT: v_ffbh_u32_e32 v0, v19 @@ -1189,11 +1189,11 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[6:7] ; GFX9-G-NEXT: v_ffbh_u32_e32 v1, v9 ; GFX9-G-NEXT: v_add_u32_e32 v2, 32, v2 -; GFX9-G-NEXT: v_ffbh_u32_e32 v3, v10 +; GFX9-G-NEXT: v_ffbh_u32_e32 v3, v12 ; GFX9-G-NEXT: v_min_u32_e32 v1, v1, v2 -; GFX9-G-NEXT: v_ffbh_u32_e32 v2, v11 +; GFX9-G-NEXT: v_ffbh_u32_e32 v2, v13 ; GFX9-G-NEXT: v_add_u32_e32 v3, 32, v3 -; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[10:11] +; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[12:13] ; GFX9-G-NEXT: v_add_u32_e32 v1, 64, v1 ; GFX9-G-NEXT: v_min_u32_e32 v2, v2, v3 ; GFX9-G-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[6:7] @@ -1220,8 +1220,8 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; GFX9-G-NEXT: v_cndmask_b32_e64 v6, v8, 0, vcc ; GFX9-G-NEXT: v_cndmask_b32_e64 v7, v9, 0, vcc -; GFX9-G-NEXT: v_cndmask_b32_e64 v12, v10, 0, vcc -; GFX9-G-NEXT: v_cndmask_b32_e64 v13, v11, 0, vcc +; GFX9-G-NEXT: v_cndmask_b32_e64 v10, v12, 0, vcc +; GFX9-G-NEXT: v_cndmask_b32_e64 v11, v13, 0, vcc ; GFX9-G-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] ; GFX9-G-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GFX9-G-NEXT: v_or_b32_e32 v14, v20, v14 @@ -1236,23 +1236,23 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: v_addc_co_u32_e32 v22, vcc, 0, v2, vcc ; GFX9-G-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v3, vcc ; GFX9-G-NEXT: s_xor_b64 s[4:5], vcc, -1 -; GFX9-G-NEXT: v_sub_co_u32_e32 v12, vcc, 0x7f, v0 -; GFX9-G-NEXT: v_sub_u32_e32 v0, 64, v12 +; GFX9-G-NEXT: v_sub_co_u32_e32 v10, vcc, 0x7f, v0 +; GFX9-G-NEXT: v_sub_u32_e32 v0, 64, v10 ; GFX9-G-NEXT: v_lshrrev_b64 v[0:1], v0, v[8:9] -; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], v12, v[10:11] -; GFX9-G-NEXT: v_add_u32_e32 v13, 0xffffffc0, v12 -; GFX9-G-NEXT: v_lshlrev_b64 v[6:7], v12, v[8:9] +; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], v10, v[12:13] +; GFX9-G-NEXT: v_add_u32_e32 v11, 0xffffffc0, v10 +; GFX9-G-NEXT: v_lshlrev_b64 v[6:7], v10, v[8:9] ; GFX9-G-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX9-G-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX9-G-NEXT: v_lshlrev_b64 v[0:1], v13, v[8:9] -; GFX9-G-NEXT: v_cmp_gt_u32_e32 vcc, 64, v12 +; GFX9-G-NEXT: v_lshlrev_b64 v[0:1], v11, v[8:9] +; GFX9-G-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 ; GFX9-G-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc ; GFX9-G-NEXT: v_cndmask_b32_e32 v7, 0, v7, vcc ; GFX9-G-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-G-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-G-NEXT: v_cmp_eq_u32_e32 vcc, 0, v12 -; GFX9-G-NEXT: v_cndmask_b32_e32 v12, v0, v10, vcc -; GFX9-G-NEXT: v_cndmask_b32_e32 v13, v1, v11, vcc +; GFX9-G-NEXT: v_cmp_eq_u32_e32 vcc, 0, v10 +; GFX9-G-NEXT: v_cndmask_b32_e32 v10, v0, v12, vcc +; GFX9-G-NEXT: v_cndmask_b32_e32 v11, v1, v13, vcc ; GFX9-G-NEXT: s_mov_b64 s[10:11], s[8:9] ; GFX9-G-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-G-NEXT: v_mov_b32_e32 v1, s9 @@ -1264,12 +1264,12 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: ; %bb.2: ; %udiv-preheader ; GFX9-G-NEXT: v_sub_u32_e32 v2, 64, v20 ; GFX9-G-NEXT: v_lshrrev_b64 v[0:1], v20, v[8:9] -; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], v2, v[10:11] +; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], v2, v[12:13] ; GFX9-G-NEXT: v_add_u32_e32 v24, 0xffffffc0, v20 -; GFX9-G-NEXT: v_lshrrev_b64 v[14:15], v20, v[10:11] +; GFX9-G-NEXT: v_lshrrev_b64 v[14:15], v20, v[12:13] ; GFX9-G-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX9-G-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX9-G-NEXT: v_lshrrev_b64 v[0:1], v24, v[10:11] +; GFX9-G-NEXT: v_lshrrev_b64 v[0:1], v24, v[12:13] ; GFX9-G-NEXT: v_cmp_gt_u32_e32 vcc, 64, v20 ; GFX9-G-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-G-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc @@ -1279,8 +1279,8 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: s_mov_b64 s[8:9], 0 ; GFX9-G-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v20 ; GFX9-G-NEXT: v_addc_co_u32_e32 v25, vcc, -1, v19, vcc -; GFX9-G-NEXT: v_cndmask_b32_e64 v10, v0, v8, s[4:5] -; GFX9-G-NEXT: v_cndmask_b32_e64 v11, v1, v9, s[4:5] +; GFX9-G-NEXT: v_cndmask_b32_e64 v12, v0, v8, s[4:5] +; GFX9-G-NEXT: v_cndmask_b32_e64 v13, v1, v9, s[4:5] ; GFX9-G-NEXT: v_addc_co_u32_e32 v26, vcc, -1, v4, vcc ; GFX9-G-NEXT: s_mov_b64 s[10:11], s[8:9] ; GFX9-G-NEXT: v_mov_b32_e32 v0, s8 @@ -1295,21 +1295,21 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: v_lshrrev_b32_e32 v8, 31, v7 ; GFX9-G-NEXT: v_or_b32_e32 v6, v0, v2 ; GFX9-G-NEXT: v_or_b32_e32 v7, v1, v3 -; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], 1, v[10:11] -; GFX9-G-NEXT: v_lshrrev_b32_e32 v10, 31, v13 +; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], 1, v[12:13] +; GFX9-G-NEXT: v_lshrrev_b32_e32 v12, 31, v11 ; GFX9-G-NEXT: v_lshlrev_b64 v[0:1], 1, v[14:15] -; GFX9-G-NEXT: v_or_b32_e32 v2, v2, v10 -; GFX9-G-NEXT: v_lshrrev_b32_e32 v14, 31, v11 -; GFX9-G-NEXT: v_sub_co_u32_e32 v10, vcc, v24, v2 +; GFX9-G-NEXT: v_or_b32_e32 v2, v2, v12 +; GFX9-G-NEXT: v_lshrrev_b32_e32 v14, 31, v13 +; GFX9-G-NEXT: v_sub_co_u32_e32 v12, vcc, v24, v2 ; GFX9-G-NEXT: v_or_b32_e32 v0, v0, v14 -; GFX9-G-NEXT: v_subb_co_u32_e32 v10, vcc, v25, v3, vcc -; GFX9-G-NEXT: v_subb_co_u32_e32 v10, vcc, v26, v0, vcc -; GFX9-G-NEXT: v_subb_co_u32_e32 v10, vcc, v27, v1, vcc -; GFX9-G-NEXT: v_ashrrev_i32_e32 v28, 31, v10 -; GFX9-G-NEXT: v_and_b32_e32 v10, v28, v18 -; GFX9-G-NEXT: v_and_b32_e32 v11, v28, v19 -; GFX9-G-NEXT: v_sub_co_u32_e32 v10, vcc, v2, v10 -; GFX9-G-NEXT: v_subb_co_u32_e32 v11, vcc, v3, v11, vcc +; GFX9-G-NEXT: v_subb_co_u32_e32 v12, vcc, v25, v3, vcc +; GFX9-G-NEXT: v_subb_co_u32_e32 v12, vcc, v26, v0, vcc +; GFX9-G-NEXT: v_subb_co_u32_e32 v12, vcc, v27, v1, vcc +; GFX9-G-NEXT: v_ashrrev_i32_e32 v28, 31, v12 +; GFX9-G-NEXT: v_and_b32_e32 v12, v28, v18 +; GFX9-G-NEXT: v_and_b32_e32 v13, v28, v19 +; GFX9-G-NEXT: v_sub_co_u32_e32 v12, vcc, v2, v12 +; GFX9-G-NEXT: v_subb_co_u32_e32 v13, vcc, v3, v13, vcc ; GFX9-G-NEXT: v_and_b32_e32 v2, v28, v4 ; GFX9-G-NEXT: v_and_b32_e32 v3, v28, v5 ; GFX9-G-NEXT: v_subb_co_u32_e32 v14, vcc, v0, v2, vcc @@ -1318,14 +1318,15 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: v_addc_co_u32_e32 v21, vcc, -1, v21, vcc ; GFX9-G-NEXT: v_addc_co_u32_e32 v22, vcc, -1, v22, vcc ; GFX9-G-NEXT: v_addc_co_u32_e32 v23, vcc, -1, v23, vcc -; GFX9-G-NEXT: v_lshlrev_b64 v[12:13], 1, v[12:13] +; GFX9-G-NEXT: v_lshlrev_b64 v[10:11], 1, v[10:11] ; GFX9-G-NEXT: v_or_b32_e32 v0, v20, v22 ; GFX9-G-NEXT: v_or_b32_e32 v1, v21, v23 ; GFX9-G-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GFX9-G-NEXT: v_or_b32_e32 v12, v12, v8 +; GFX9-G-NEXT: v_or_b32_e32 v10, v10, v8 ; GFX9-G-NEXT: v_and_b32_e32 v8, 1, v28 -; GFX9-G-NEXT: v_mov_b32_e32 v0, v8 +; GFX9-G-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX9-G-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX9-G-NEXT: v_mov_b32_e32 v0, v8 ; GFX9-G-NEXT: v_mov_b32_e32 v1, v9 ; GFX9-G-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX9-G-NEXT: s_cbranch_execnz .LBB0_3 @@ -1334,9 +1335,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: .LBB0_5: ; %Flow2 ; GFX9-G-NEXT: s_or_b64 exec, exec, s[12:13] ; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], 1, v[6:7] -; GFX9-G-NEXT: v_lshlrev_b64 v[12:13], 1, v[12:13] +; GFX9-G-NEXT: v_lshlrev_b64 v[10:11], 1, v[10:11] ; GFX9-G-NEXT: v_lshrrev_b32_e32 v4, 31, v7 -; GFX9-G-NEXT: v_or_b32_e32 v12, v12, v4 +; GFX9-G-NEXT: v_or_b32_e32 v10, v10, v4 ; GFX9-G-NEXT: v_or_b32_e32 v6, v0, v2 ; GFX9-G-NEXT: v_or_b32_e32 v7, v1, v3 ; GFX9-G-NEXT: .LBB0_6: ; %Flow3 @@ -1345,9 +1346,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: v_xor_b32_e32 v0, v6, v3 ; GFX9-G-NEXT: v_xor_b32_e32 v1, v7, v3 ; GFX9-G-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v3 -; GFX9-G-NEXT: v_xor_b32_e32 v2, v12, v3 +; GFX9-G-NEXT: v_xor_b32_e32 v2, v10, v3 ; GFX9-G-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9-G-NEXT: v_xor_b32_e32 v4, v13, v3 +; GFX9-G-NEXT: v_xor_b32_e32 v4, v11, v3 ; GFX9-G-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v3, vcc ; GFX9-G-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc ; GFX9-G-NEXT: s_setpc_b64 s[30:31] @@ -3391,14 +3392,15 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: v_addc_co_u32_e32 v19, vcc, -1, v19, vcc ; GFX9-G-NEXT: v_addc_co_u32_e32 v20, vcc, -1, v20, vcc ; GFX9-G-NEXT: v_addc_co_u32_e32 v21, vcc, -1, v21, vcc +; GFX9-G-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9] ; GFX9-G-NEXT: v_or_b32_e32 v10, v18, v20 ; GFX9-G-NEXT: v_or_b32_e32 v11, v19, v21 -; GFX9-G-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9] ; GFX9-G-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] ; GFX9-G-NEXT: v_or_b32_e32 v8, v8, v0 ; GFX9-G-NEXT: v_and_b32_e32 v0, 1, v12 -; GFX9-G-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-G-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13 ; GFX9-G-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX9-G-NEXT: v_mov_b32_e32 v11, v1 ; GFX9-G-NEXT: v_mov_b32_e32 v10, v0 ; GFX9-G-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX9-G-NEXT: s_cbranch_execnz .LBB1_3 diff --git a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll index 5134159e3e406..82052f796daab 100644 --- a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll @@ -600,6 +600,7 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 ; GISEL-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GISEL-NEXT: v_or_b32_e32 v8, v8, v36 +; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GISEL-NEXT: v_mov_b32_e32 v0, v22 ; GISEL-NEXT: v_mov_b32_e32 v1, v23 ; GISEL-NEXT: s_andn2_b64 exec, exec, s[8:9] @@ -627,33 +628,33 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_xor_b32_e32 v3, v18, v7 ; GISEL-NEXT: v_xor_b32_e32 v4, v19, v12 ; GISEL-NEXT: v_xor_b32_e32 v5, v19, v13 -; GISEL-NEXT: v_xor_b32_e32 v14, v19, v14 -; GISEL-NEXT: v_xor_b32_e32 v15, v19, v15 -; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v0, v18 -; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v1, v18, vcc +; GISEL-NEXT: v_xor_b32_e32 v6, v19, v14 +; GISEL-NEXT: v_xor_b32_e32 v7, v19, v15 +; GISEL-NEXT: v_sub_i32_e32 v12, vcc, v0, v18 +; GISEL-NEXT: v_subb_u32_e32 v13, vcc, v1, v18, vcc ; GISEL-NEXT: v_sub_i32_e64 v20, s[4:5], v4, v19 ; GISEL-NEXT: v_subb_u32_e64 v21, s[4:5], v5, v19, s[4:5] -; GISEL-NEXT: v_subb_u32_e32 v12, vcc, v2, v18, vcc -; GISEL-NEXT: v_subb_u32_e32 v13, vcc, v3, v18, vcc -; GISEL-NEXT: v_subb_u32_e64 v4, vcc, v14, v19, s[4:5] -; GISEL-NEXT: v_subb_u32_e32 v5, vcc, v15, v19, vcc -; GISEL-NEXT: v_ffbh_u32_e32 v14, v21 -; GISEL-NEXT: v_ffbh_u32_e32 v15, v20 -; GISEL-NEXT: v_ffbh_u32_e32 v16, v7 -; GISEL-NEXT: v_ffbh_u32_e32 v17, v6 +; GISEL-NEXT: v_subb_u32_e32 v14, vcc, v2, v18, vcc +; GISEL-NEXT: v_subb_u32_e32 v15, vcc, v3, v18, vcc +; GISEL-NEXT: v_subb_u32_e64 v4, vcc, v6, v19, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v5, vcc, v7, v19, vcc +; GISEL-NEXT: v_ffbh_u32_e32 v6, v21 +; GISEL-NEXT: v_ffbh_u32_e32 v7, v20 +; GISEL-NEXT: v_ffbh_u32_e32 v16, v13 +; GISEL-NEXT: v_ffbh_u32_e32 v17, v12 ; GISEL-NEXT: v_or_b32_e32 v0, v20, v4 ; GISEL-NEXT: v_or_b32_e32 v1, v21, v5 -; GISEL-NEXT: v_or_b32_e32 v2, v6, v12 -; GISEL-NEXT: v_or_b32_e32 v3, v7, v13 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, 32, v15 +; GISEL-NEXT: v_or_b32_e32 v2, v12, v14 +; GISEL-NEXT: v_or_b32_e32 v3, v13, v15 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, 32, v7 ; GISEL-NEXT: v_ffbh_u32_e32 v26, v5 ; GISEL-NEXT: v_ffbh_u32_e32 v27, v4 ; GISEL-NEXT: v_add_i32_e32 v17, vcc, 32, v17 -; GISEL-NEXT: v_ffbh_u32_e32 v28, v13 -; GISEL-NEXT: v_ffbh_u32_e32 v29, v12 +; GISEL-NEXT: v_ffbh_u32_e32 v28, v15 +; GISEL-NEXT: v_ffbh_u32_e32 v29, v14 ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] ; GISEL-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[2:3] -; GISEL-NEXT: v_min_u32_e32 v0, v14, v15 +; GISEL-NEXT: v_min_u32_e32 v0, v6, v7 ; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], 32, v27 ; GISEL-NEXT: v_min_u32_e32 v2, v16, v17 ; GISEL-NEXT: v_add_i32_e64 v3, s[6:7], 32, v29 @@ -662,35 +663,35 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_add_i32_e64 v2, s[6:7], 64, v2 ; GISEL-NEXT: v_min_u32_e32 v3, v28, v3 ; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] ; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[12:13] +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] ; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v0, v1 ; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], 0, 0, vcc ; GISEL-NEXT: v_subb_u32_e64 v0, s[4:5], 0, 0, s[4:5] ; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], 0, 0, s[4:5] ; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[10:11] -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_xor_b32_e32 v10, 0x7f, v2 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_xor_b32_e32 v6, 0x7f, v2 ; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[0:1] -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_or_b32_e32 v10, v10, v0 -; GISEL-NEXT: v_or_b32_e32 v11, v3, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_or_b32_e32 v6, v6, v0 +; GISEL-NEXT: v_or_b32_e32 v7, v3, v1 ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GISEL-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc -; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_or_b32_e32 v11, v14, v15 -; GISEL-NEXT: v_and_b32_e32 v14, 1, v11 -; GISEL-NEXT: v_or_b32_e32 v10, v11, v10 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, v6, 0, vcc -; GISEL-NEXT: v_and_b32_e32 v16, 1, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v15, v7, 0, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v10, v12, 0, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v11, v13, 0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_or_b32_e32 v7, v16, v10 +; GISEL-NEXT: v_and_b32_e32 v10, 1, v7 +; GISEL-NEXT: v_or_b32_e32 v7, v7, v6 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v6, v12, 0, vcc +; GISEL-NEXT: v_and_b32_e32 v16, 1, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, v13, 0, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v10, v14, 0, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v11, v15, 0, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 ; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] @@ -702,23 +703,23 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_not_b32_e32 v2, 63 ; GISEL-NEXT: v_addc_u32_e64 v28, vcc, 0, v0, s[4:5] ; GISEL-NEXT: v_addc_u32_e32 v29, vcc, 0, v1, vcc -; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v30, v2 -; GISEL-NEXT: v_sub_i32_e64 v10, s[4:5], 64, v30 -; GISEL-NEXT: v_lshl_b64 v[0:1], v[6:7], v30 -; GISEL-NEXT: v_lshl_b64 v[2:3], v[12:13], v30 +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v30, v2 +; GISEL-NEXT: v_sub_i32_e64 v7, s[4:5], 64, v30 +; GISEL-NEXT: v_lshl_b64 v[0:1], v[12:13], v30 +; GISEL-NEXT: v_lshl_b64 v[2:3], v[14:15], v30 ; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 -; GISEL-NEXT: v_lshr_b64 v[10:11], v[6:7], v10 -; GISEL-NEXT: v_lshl_b64 v[16:17], v[6:7], v14 +; GISEL-NEXT: v_lshr_b64 v[10:11], v[12:13], v7 +; GISEL-NEXT: v_lshl_b64 v[16:17], v[12:13], v6 ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v30 -; GISEL-NEXT: v_cndmask_b32_e32 v14, 0, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v15, 0, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v6, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v7, 0, v1, vcc ; GISEL-NEXT: v_or_b32_e32 v0, v10, v2 ; GISEL-NEXT: v_or_b32_e32 v1, v11, v3 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30 -; GISEL-NEXT: v_cndmask_b32_e32 v10, v0, v12, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v11, v1, v13, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v10, v0, v14, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v11, v1, v15, vcc ; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9] ; GISEL-NEXT: v_mov_b32_e32 v0, s8 ; GISEL-NEXT: v_mov_b32_e32 v1, s9 @@ -730,101 +731,102 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: ; %bb.8: ; %udiv-preheader ; GISEL-NEXT: v_add_i32_e32 v32, vcc, 0xffffffc0, v26 ; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 64, v26 -; GISEL-NEXT: v_lshr_b64 v[0:1], v[12:13], v26 -; GISEL-NEXT: v_lshr_b64 v[2:3], v[6:7], v26 +; GISEL-NEXT: v_lshr_b64 v[0:1], v[14:15], v26 +; GISEL-NEXT: v_lshr_b64 v[2:3], v[12:13], v26 ; GISEL-NEXT: s_mov_b64 s[4:5], 0 ; GISEL-NEXT: v_add_i32_e32 v30, vcc, -1, v20 ; GISEL-NEXT: v_addc_u32_e32 v31, vcc, -1, v21, vcc -; GISEL-NEXT: v_lshl_b64 v[16:17], v[12:13], v16 -; GISEL-NEXT: v_lshr_b64 v[12:13], v[12:13], v32 +; GISEL-NEXT: v_lshl_b64 v[16:17], v[14:15], v16 +; GISEL-NEXT: v_lshr_b64 v[14:15], v[14:15], v32 ; GISEL-NEXT: v_addc_u32_e32 v32, vcc, -1, v4, vcc ; GISEL-NEXT: v_addc_u32_e32 v33, vcc, -1, v5, vcc ; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] ; GISEL-NEXT: v_or_b32_e32 v2, v2, v16 ; GISEL-NEXT: v_or_b32_e32 v3, v3, v17 ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v26 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v14, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v15, v3, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v16, 0, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v17, 0, v1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v26 -; GISEL-NEXT: v_cndmask_b32_e32 v12, v2, v6, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v13, v3, v7, vcc -; GISEL-NEXT: v_mov_b32_e32 v7, 0 +; GISEL-NEXT: v_cndmask_b32_e32 v14, v2, v12, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v15, v3, v13, vcc +; GISEL-NEXT: v_mov_b32_e32 v13, 0 ; GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GISEL-NEXT: v_mov_b32_e32 v1, s5 ; GISEL-NEXT: v_mov_b32_e32 v2, s6 ; GISEL-NEXT: v_mov_b32_e32 v3, s7 ; GISEL-NEXT: .LBB0_9: ; %udiv-do-while ; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 -; GISEL-NEXT: v_lshl_b64 v[2:3], v[12:13], 1 +; GISEL-NEXT: v_lshl_b64 v[34:35], v[14:15], 1 ; GISEL-NEXT: v_lshl_b64 v[16:17], v[16:17], 1 -; GISEL-NEXT: v_lshrrev_b32_e32 v6, 31, v13 -; GISEL-NEXT: v_lshrrev_b32_e32 v34, 31, v11 -; GISEL-NEXT: v_lshl_b64 v[12:13], v[14:15], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v2, 31, v15 +; GISEL-NEXT: v_or_b32_e32 v16, v16, v2 +; GISEL-NEXT: v_lshrrev_b32_e32 v2, 31, v11 +; GISEL-NEXT: v_or_b32_e32 v14, v34, v2 +; GISEL-NEXT: v_lshl_b64 v[2:3], v[6:7], 1 ; GISEL-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 -; GISEL-NEXT: v_lshrrev_b32_e32 v14, 31, v15 +; GISEL-NEXT: v_lshrrev_b32_e32 v6, 31, v7 +; GISEL-NEXT: v_or_b32_e32 v10, v10, v6 +; GISEL-NEXT: v_or_b32_e32 v6, v0, v2 +; GISEL-NEXT: v_or_b32_e32 v7, v1, v3 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v30, v14 +; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v31, v35, vcc +; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v32, v16, vcc +; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v33, v17, vcc +; GISEL-NEXT: v_ashrrev_i32_e32 v15, 31, v0 +; GISEL-NEXT: v_and_b32_e32 v12, 1, v15 +; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GISEL-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-NEXT: v_mov_b32_e32 v1, v13 +; GISEL-NEXT: v_and_b32_e32 v2, v15, v20 +; GISEL-NEXT: v_and_b32_e32 v3, v15, v21 +; GISEL-NEXT: v_and_b32_e32 v12, v15, v4 +; GISEL-NEXT: v_and_b32_e32 v34, v15, v5 +; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v14, v2 +; GISEL-NEXT: v_subb_u32_e32 v15, vcc, v35, v3, vcc +; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v16, v12, vcc +; GISEL-NEXT: v_subb_u32_e32 v17, vcc, v17, v34, vcc ; GISEL-NEXT: v_add_i32_e32 v26, vcc, -1, v26 ; GISEL-NEXT: v_addc_u32_e32 v27, vcc, -1, v27, vcc -; GISEL-NEXT: v_or_b32_e32 v16, v16, v6 -; GISEL-NEXT: v_or_b32_e32 v2, v2, v34 -; GISEL-NEXT: v_or_b32_e32 v10, v10, v14 -; GISEL-NEXT: v_or_b32_e32 v14, v0, v12 -; GISEL-NEXT: v_or_b32_e32 v15, v1, v13 ; GISEL-NEXT: v_addc_u32_e32 v28, vcc, -1, v28, vcc ; GISEL-NEXT: v_addc_u32_e32 v29, vcc, -1, v29, vcc -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v30, v2 -; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v31, v3, vcc -; GISEL-NEXT: v_or_b32_e32 v0, v26, v28 -; GISEL-NEXT: v_or_b32_e32 v1, v27, v29 -; GISEL-NEXT: v_subb_u32_e32 v6, vcc, v32, v16, vcc -; GISEL-NEXT: v_subb_u32_e32 v6, vcc, v33, v17, vcc -; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GISEL-NEXT: v_ashrrev_i32_e32 v0, 31, v6 +; GISEL-NEXT: v_or_b32_e32 v2, v26, v28 +; GISEL-NEXT: v_or_b32_e32 v3, v27, v29 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] ; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GISEL-NEXT: v_and_b32_e32 v6, 1, v0 -; GISEL-NEXT: v_and_b32_e32 v12, v0, v20 -; GISEL-NEXT: v_and_b32_e32 v13, v0, v21 -; GISEL-NEXT: v_and_b32_e32 v34, v0, v4 -; GISEL-NEXT: v_and_b32_e32 v35, v0, v5 -; GISEL-NEXT: v_mov_b32_e32 v0, v6 -; GISEL-NEXT: v_mov_b32_e32 v1, v7 -; GISEL-NEXT: v_sub_i32_e32 v12, vcc, v2, v12 -; GISEL-NEXT: v_subb_u32_e32 v13, vcc, v3, v13, vcc -; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v16, v34, vcc -; GISEL-NEXT: v_subb_u32_e32 v17, vcc, v17, v35, vcc ; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GISEL-NEXT: s_cbranch_execnz .LBB0_9 ; GISEL-NEXT: ; %bb.10: ; %Flow ; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] ; GISEL-NEXT: .LBB0_11: ; %Flow11 ; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] -; GISEL-NEXT: v_lshl_b64 v[2:3], v[14:15], 1 +; GISEL-NEXT: v_lshl_b64 v[2:3], v[6:7], 1 ; GISEL-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 -; GISEL-NEXT: v_lshrrev_b32_e32 v4, 31, v15 +; GISEL-NEXT: v_lshrrev_b32_e32 v4, 31, v7 ; GISEL-NEXT: v_or_b32_e32 v10, v10, v4 -; GISEL-NEXT: v_or_b32_e32 v14, v0, v2 -; GISEL-NEXT: v_or_b32_e32 v15, v1, v3 +; GISEL-NEXT: v_or_b32_e32 v6, v0, v2 +; GISEL-NEXT: v_or_b32_e32 v7, v1, v3 ; GISEL-NEXT: .LBB0_12: ; %Flow12 ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] ; GISEL-NEXT: v_xor_b32_e32 v3, v25, v24 -; GISEL-NEXT: v_xor_b32_e32 v7, v19, v18 +; GISEL-NEXT: v_xor_b32_e32 v12, v19, v18 ; GISEL-NEXT: v_xor_b32_e32 v0, v22, v3 ; GISEL-NEXT: v_xor_b32_e32 v1, v23, v3 ; GISEL-NEXT: v_xor_b32_e32 v2, v8, v3 -; GISEL-NEXT: v_xor_b32_e32 v6, v9, v3 -; GISEL-NEXT: v_xor_b32_e32 v4, v14, v7 -; GISEL-NEXT: v_xor_b32_e32 v5, v15, v7 -; GISEL-NEXT: v_xor_b32_e32 v8, v10, v7 -; GISEL-NEXT: v_xor_b32_e32 v9, v11, v7 +; GISEL-NEXT: v_xor_b32_e32 v8, v9, v3 +; GISEL-NEXT: v_xor_b32_e32 v4, v6, v12 +; GISEL-NEXT: v_xor_b32_e32 v5, v7, v12 +; GISEL-NEXT: v_xor_b32_e32 v6, v10, v12 +; GISEL-NEXT: v_xor_b32_e32 v7, v11, v12 ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], v4, v7 -; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], v5, v7, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], v4, v12 +; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], v5, v12, s[4:5] ; GISEL-NEXT: v_subb_u32_e32 v2, vcc, v2, v3, vcc -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v6, v3, vcc -; GISEL-NEXT: v_subb_u32_e64 v6, vcc, v8, v7, s[4:5] -; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v9, v7, vcc +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v8, v3, vcc +; GISEL-NEXT: v_subb_u32_e64 v6, vcc, v6, v12, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v7, v12, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] %shl = sdiv <2 x i128> %lhs, %rhs ret <2 x i128> %shl @@ -1356,6 +1358,7 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v16, v21, vcc ; GISEL-NEXT: v_subb_u32_e32 v17, vcc, v17, v35, vcc ; GISEL-NEXT: v_or_b32_e32 v2, v2, v34 +; GISEL-NEXT: ; implicit-def: $vgpr18_vgpr19_vgpr20_vgpr21 ; GISEL-NEXT: v_mov_b32_e32 v19, v1 ; GISEL-NEXT: v_mov_b32_e32 v18, v0 ; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5] @@ -1498,14 +1501,15 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_lshl_b64 v[22:23], v[6:7], 1 ; GISEL-NEXT: v_lshl_b64 v[16:17], v[16:17], 1 ; GISEL-NEXT: v_lshrrev_b32_e32 v4, 31, v7 -; GISEL-NEXT: v_lshrrev_b32_e32 v30, 31, v1 +; GISEL-NEXT: v_lshrrev_b32_e32 v34, 31, v1 ; GISEL-NEXT: v_lshl_b64 v[6:7], v[9:10], 1 ; GISEL-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 ; GISEL-NEXT: v_lshrrev_b32_e32 v9, 31, v10 +; GISEL-NEXT: ; implicit-def: $vgpr30_vgpr31_vgpr32_vgpr33 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, -1, v8 ; GISEL-NEXT: v_addc_u32_e32 v11, vcc, -1, v11, vcc ; GISEL-NEXT: v_or_b32_e32 v16, v16, v4 -; GISEL-NEXT: v_or_b32_e32 v22, v22, v30 +; GISEL-NEXT: v_or_b32_e32 v22, v22, v34 ; GISEL-NEXT: v_or_b32_e32 v0, v0, v9 ; GISEL-NEXT: v_or_b32_e32 v9, v20, v6 ; GISEL-NEXT: v_or_b32_e32 v10, v21, v7 @@ -2023,6 +2027,8 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-LABEL: v_srem_v2i128_vv: ; GISEL: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GISEL-NEXT: buffer_store_dword v41, off, s[0:3], s32 ; 4-byte Folded Spill ; GISEL-NEXT: v_ashrrev_i32_e32 v28, 31, v3 ; GISEL-NEXT: v_ashrrev_i32_e32 v18, 31, v11 ; GISEL-NEXT: v_mov_b32_e32 v19, 0x7f @@ -2198,6 +2204,7 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_subb_u32_e32 v24, vcc, v2, v48, vcc ; GISEL-NEXT: v_subb_u32_e32 v25, vcc, v25, v49, vcc ; GISEL-NEXT: v_or_b32_e32 v18, v18, v39 +; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GISEL-NEXT: v_mov_b32_e32 v0, v22 ; GISEL-NEXT: v_mov_b32_e32 v1, v23 ; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5] @@ -2358,16 +2365,17 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_lshl_b64 v[2:3], v[20:21], 1 ; GISEL-NEXT: v_lshrrev_b32_e32 v22, 31, v21 ; GISEL-NEXT: v_lshl_b64 v[52:53], v[24:25], 1 -; GISEL-NEXT: v_lshl_b64 v[26:27], v[26:27], 1 -; GISEL-NEXT: v_lshrrev_b32_e32 v24, 31, v25 -; GISEL-NEXT: v_lshrrev_b32_e32 v25, 31, v15 +; GISEL-NEXT: v_lshl_b64 v[54:55], v[26:27], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v40, 31, v25 +; GISEL-NEXT: v_lshrrev_b32_e32 v41, 31, v15 ; GISEL-NEXT: v_lshl_b64 v[14:15], v[14:15], 1 +; GISEL-NEXT: ; implicit-def: $vgpr24_vgpr25_vgpr26_vgpr27 ; GISEL-NEXT: v_add_i32_e32 v36, vcc, -1, v36 ; GISEL-NEXT: v_addc_u32_e32 v37, vcc, -1, v37, vcc ; GISEL-NEXT: v_or_b32_e32 v20, v0, v2 ; GISEL-NEXT: v_or_b32_e32 v21, v1, v3 -; GISEL-NEXT: v_or_b32_e32 v2, v26, v24 -; GISEL-NEXT: v_or_b32_e32 v3, v52, v25 +; GISEL-NEXT: v_or_b32_e32 v2, v54, v40 +; GISEL-NEXT: v_or_b32_e32 v3, v52, v41 ; GISEL-NEXT: v_or_b32_e32 v14, v14, v22 ; GISEL-NEXT: v_addc_u32_e32 v38, vcc, -1, v38, vcc ; GISEL-NEXT: v_addc_u32_e32 v39, vcc, -1, v39, vcc @@ -2376,7 +2384,7 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_or_b32_e32 v0, v36, v38 ; GISEL-NEXT: v_or_b32_e32 v1, v37, v39 ; GISEL-NEXT: v_subb_u32_e32 v22, vcc, v50, v2, vcc -; GISEL-NEXT: v_subb_u32_e32 v22, vcc, v51, v27, vcc +; GISEL-NEXT: v_subb_u32_e32 v22, vcc, v51, v55, vcc ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] ; GISEL-NEXT: v_ashrrev_i32_e32 v0, 31, v22 ; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] @@ -2384,13 +2392,13 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_and_b32_e32 v1, v0, v35 ; GISEL-NEXT: v_and_b32_e32 v25, v0, v34 ; GISEL-NEXT: v_and_b32_e32 v26, v0, v4 -; GISEL-NEXT: v_and_b32_e32 v52, v0, v5 +; GISEL-NEXT: v_and_b32_e32 v27, v0, v5 ; GISEL-NEXT: v_sub_i32_e32 v24, vcc, v3, v1 ; GISEL-NEXT: v_subb_u32_e32 v25, vcc, v53, v25, vcc ; GISEL-NEXT: v_mov_b32_e32 v0, v22 ; GISEL-NEXT: v_mov_b32_e32 v1, v23 ; GISEL-NEXT: v_subb_u32_e32 v26, vcc, v2, v26, vcc -; GISEL-NEXT: v_subb_u32_e32 v27, vcc, v27, v52, vcc +; GISEL-NEXT: v_subb_u32_e32 v27, vcc, v55, v27, vcc ; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GISEL-NEXT: s_cbranch_execnz .LBB2_9 ; GISEL-NEXT: ; %bb.10: ; %Flow @@ -2455,6 +2463,9 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v7, v28, vcc ; GISEL-NEXT: v_subb_u32_e64 v6, vcc, v6, v33, s[8:9] ; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v8, v33, vcc +; GISEL-NEXT: buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload +; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] %shl = srem <2 x i128> %lhs, %rhs ret <2 x i128> %shl @@ -3025,6 +3036,7 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_subb_u32_e32 v26, vcc, v18, v39, vcc ; GISEL-NEXT: v_subb_u32_e32 v27, vcc, v27, v48, vcc ; GISEL-NEXT: v_or_b32_e32 v20, v20, v38 +; GISEL-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19 ; GISEL-NEXT: v_mov_b32_e32 v16, v24 ; GISEL-NEXT: v_mov_b32_e32 v17, v25 ; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5] @@ -3167,16 +3179,17 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_lshl_b64 v[18:19], v[24:25], 1 ; GISEL-NEXT: v_lshrrev_b32_e32 v26, 31, v25 ; GISEL-NEXT: v_lshl_b64 v[50:51], v[28:29], 1 -; GISEL-NEXT: v_lshl_b64 v[30:31], v[30:31], 1 -; GISEL-NEXT: v_lshrrev_b32_e32 v28, 31, v29 -; GISEL-NEXT: v_lshrrev_b32_e32 v29, 31, v23 +; GISEL-NEXT: v_lshl_b64 v[52:53], v[30:31], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v54, 31, v29 +; GISEL-NEXT: v_lshrrev_b32_e32 v55, 31, v23 ; GISEL-NEXT: v_lshl_b64 v[22:23], v[22:23], 1 +; GISEL-NEXT: ; implicit-def: $vgpr28_vgpr29_vgpr30_vgpr31 ; GISEL-NEXT: v_add_i32_e32 v34, vcc, -1, v34 ; GISEL-NEXT: v_addc_u32_e32 v35, vcc, -1, v35, vcc ; GISEL-NEXT: v_or_b32_e32 v24, v16, v18 ; GISEL-NEXT: v_or_b32_e32 v25, v17, v19 -; GISEL-NEXT: v_or_b32_e32 v18, v30, v28 -; GISEL-NEXT: v_or_b32_e32 v19, v50, v29 +; GISEL-NEXT: v_or_b32_e32 v18, v52, v54 +; GISEL-NEXT: v_or_b32_e32 v19, v50, v55 ; GISEL-NEXT: v_or_b32_e32 v22, v22, v26 ; GISEL-NEXT: v_addc_u32_e32 v36, vcc, -1, v36, vcc ; GISEL-NEXT: v_addc_u32_e32 v37, vcc, -1, v37, vcc @@ -3185,7 +3198,7 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_or_b32_e32 v16, v34, v36 ; GISEL-NEXT: v_or_b32_e32 v17, v35, v37 ; GISEL-NEXT: v_subb_u32_e32 v26, vcc, v48, v18, vcc -; GISEL-NEXT: v_subb_u32_e32 v26, vcc, v49, v31, vcc +; GISEL-NEXT: v_subb_u32_e32 v26, vcc, v49, v53, vcc ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17] ; GISEL-NEXT: v_ashrrev_i32_e32 v16, 31, v26 ; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] @@ -3193,13 +3206,13 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_and_b32_e32 v17, v16, v12 ; GISEL-NEXT: v_and_b32_e32 v29, v16, v13 ; GISEL-NEXT: v_and_b32_e32 v30, v16, v14 -; GISEL-NEXT: v_and_b32_e32 v50, v16, v15 +; GISEL-NEXT: v_and_b32_e32 v31, v16, v15 ; GISEL-NEXT: v_sub_i32_e32 v28, vcc, v19, v17 ; GISEL-NEXT: v_subb_u32_e32 v29, vcc, v51, v29, vcc ; GISEL-NEXT: v_mov_b32_e32 v16, v26 ; GISEL-NEXT: v_mov_b32_e32 v17, v27 ; GISEL-NEXT: v_subb_u32_e32 v30, vcc, v18, v30, vcc -; GISEL-NEXT: v_subb_u32_e32 v31, vcc, v31, v50, vcc +; GISEL-NEXT: v_subb_u32_e32 v31, vcc, v53, v31, vcc ; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GISEL-NEXT: s_cbranch_execnz .LBB3_9 ; GISEL-NEXT: ; %bb.10: ; %Flow diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll index 9c59b4236cae4..628f1e18b33e1 100644 --- a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll +++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll @@ -562,11 +562,12 @@ define i32 @divergent_vec_i16_HH(i32 %a, i32 %b) { ; ; GFX11-TRUE16-LABEL: divergent_vec_i16_HH: ; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v2, 16, v0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: divergent_vec_i16_HH: diff --git a/llvm/test/CodeGen/AMDGPU/early-lis-two-address-partial-def.mir b/llvm/test/CodeGen/AMDGPU/early-lis-two-address-partial-def.mir index 186b171f4e805..588d3e0378502 100644 --- a/llvm/test/CodeGen/AMDGPU/early-lis-two-address-partial-def.mir +++ b/llvm/test/CodeGen/AMDGPU/early-lis-two-address-partial-def.mir @@ -30,10 +30,11 @@ body: | ; GFX90A-NEXT: [[COPY10:%[0-9]+]].sub5:sgpr_256 = COPY [[COPY4]] ; GFX90A-NEXT: [[COPY10:%[0-9]+]].sub6:sgpr_256 = COPY [[COPY3]] ; GFX90A-NEXT: [[COPY10:%[0-9]+]].sub7:sgpr_256 = COPY [[COPY2]] - ; GFX90A-NEXT: undef [[COPY11:%[0-9]+]].sub0:vreg_64_align2 = COPY [[COPY]] - ; GFX90A-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] - ; GFX90A-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = IMAGE_ATOMIC_SWAP_V1_V1_gfx90a [[COPY12]], [[COPY11]].sub0, [[COPY10]], 1, -1, 1, 0, 0, 0, implicit $exec, implicit [[COPY11]] :: (volatile dereferenceable load store (s32), addrspace 8) - ; GFX90A-NEXT: $vgpr0 = COPY [[COPY12]] + ; GFX90A-NEXT: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF + ; GFX90A-NEXT: [[DEF:%[0-9]+]].sub0:vreg_64_align2 = COPY [[COPY]] + ; GFX90A-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] + ; GFX90A-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = IMAGE_ATOMIC_SWAP_V1_V1_gfx90a [[COPY11]], [[DEF]].sub0, [[COPY10]], 1, -1, 1, 0, 0, 0, implicit $exec, implicit [[DEF]] :: (volatile dereferenceable load store (s32), addrspace 8) + ; GFX90A-NEXT: $vgpr0 = COPY [[COPY11]] ; GFX90A-NEXT: SI_RETURN_TO_EPILOG $vgpr0 %9:vgpr_32 = COPY $vgpr1 %8:vgpr_32 = COPY $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll index 6a898fa799f3e..5c691c96de75d 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll @@ -52,11 +52,22 @@ define bfloat @v_copysign_bf16_bf16(bfloat %mag, bfloat %sign) { ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_copysign_bf16_bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11TRUE16-LABEL: v_copysign_bf16_bf16: +; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: ; implicit-def: $vgpr2 +; GFX11TRUE16-NEXT: ; implicit-def: $vgpr3 +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v3, v2 +; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11FAKE16-LABEL: v_copysign_bf16_bf16: +; GFX11FAKE16: ; %bb.0: +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 +; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign) ret bfloat %op } @@ -108,10 +119,13 @@ define bfloat @v_copysign_bf16_s_bf16(bfloat %mag, bfloat inreg %sign) { ; ; GFX11TRUE16-LABEL: v_copysign_bf16_s_bf16: ; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: ; implicit-def: $vgpr1 +; GFX11TRUE16-NEXT: ; implicit-def: $vgpr2 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 +; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v2, v1 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11FAKE16-LABEL: v_copysign_bf16_s_bf16: @@ -170,10 +184,13 @@ define bfloat @v_copysign_s_bf16_bf16(bfloat inreg %mag, bfloat %sign) { ; ; GFX11TRUE16-LABEL: v_copysign_s_bf16_bf16: ; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: ; implicit-def: $vgpr1 +; GFX11TRUE16-NEXT: ; implicit-def: $vgpr2 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, s0 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v1, v0 +; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v2, v1 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11FAKE16-LABEL: v_copysign_s_bf16_bf16: @@ -233,10 +250,11 @@ define bfloat @v_copysign_bf16_f32(bfloat %mag, float %sign.f32) { ; ; GFX11TRUE16-LABEL: v_copysign_bf16_f32: ; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: ; implicit-def: $vgpr2 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff0000, v0, v1 +; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff0000, v2, v1 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -300,10 +318,11 @@ define bfloat @v_copysign_bf16_f64(bfloat %mag, double %sign.f64) { ; ; GFX11TRUE16-LABEL: v_copysign_bf16_f64: ; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: ; implicit-def: $vgpr1 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff0000, v0, v2 +; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff0000, v1, v2 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -362,11 +381,22 @@ define bfloat @v_copysign_bf16_f16(bfloat %mag, half %sign.f16) { ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_copysign_bf16_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11TRUE16-LABEL: v_copysign_bf16_f16: +; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: ; implicit-def: $vgpr2 +; GFX11TRUE16-NEXT: ; implicit-def: $vgpr3 +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v3, v2 +; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11FAKE16-LABEL: v_copysign_bf16_f16: +; GFX11FAKE16: ; %bb.0: +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 +; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] %sign = bitcast half %sign.f16 to bfloat %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign) ret bfloat %op @@ -423,6 +453,8 @@ define amdgpu_ps i32 @s_copysign_bf16_bf16(bfloat inreg %mag, bfloat inreg %sign ; ; GFX11TRUE16-LABEL: s_copysign_bf16_bf16: ; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: ; implicit-def: $vgpr0 +; GFX11TRUE16-NEXT: ; implicit-def: $vgpr1 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s1 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s0 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -499,6 +531,7 @@ define amdgpu_ps i32 @s_copysign_bf16_f32(bfloat inreg %mag, float inreg %sign.f ; ; GFX11TRUE16-LABEL: s_copysign_bf16_f32: ; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: ; implicit-def: $vgpr0 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s0 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff0000, v0, s1 @@ -575,6 +608,7 @@ define amdgpu_ps i32 @s_copysign_bf16_f64(bfloat inreg %mag, double inreg %sign. ; ; GFX11TRUE16-LABEL: s_copysign_bf16_f64: ; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: ; implicit-def: $vgpr0 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s0 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff0000, v0, s2 @@ -651,6 +685,8 @@ define amdgpu_ps i32 @s_copysign_bf16_f16(bfloat inreg %mag, half inreg %sign.f1 ; ; GFX11TRUE16-LABEL: s_copysign_bf16_f16: ; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: ; implicit-def: $vgpr0 +; GFX11TRUE16-NEXT: ; implicit-def: $vgpr1 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s1 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s0 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -719,10 +755,11 @@ define float @v_copysign_f32_bf16(float %mag, bfloat %sign.bf16) { ; ; GFX11TRUE16-LABEL: v_copysign_f32_bf16: ; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: ; implicit-def: $vgpr2 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v1.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v1 +; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v2 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11FAKE16-LABEL: v_copysign_f32_bf16: @@ -783,6 +820,7 @@ define amdgpu_ps i32 @s_copysign_f32_bf16(float inreg %mag, bfloat inreg %sign.b ; ; GFX11TRUE16-LABEL: s_copysign_f32_bf16: ; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: ; implicit-def: $vgpr0 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s1 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fffffff, s0, v0 @@ -850,11 +888,22 @@ define half @v_copysign_f16_bf16(half %mag, bfloat %sign.bf16) { ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_copysign_f16_bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11TRUE16-LABEL: v_copysign_f16_bf16: +; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: ; implicit-def: $vgpr2 +; GFX11TRUE16-NEXT: ; implicit-def: $vgpr3 +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v3, v2 +; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11FAKE16-LABEL: v_copysign_f16_bf16: +; GFX11FAKE16: ; %bb.0: +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 +; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] %sign = bitcast bfloat %sign.bf16 to half %op = call half @llvm.copysign.f16(half %mag, half %sign) ret half %op @@ -917,6 +966,8 @@ define amdgpu_ps i32 @s_copysign_f16_bf16(half inreg %mag, bfloat inreg %sign.bf ; ; GFX11TRUE16-LABEL: s_copysign_f16_bf16: ; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: ; implicit-def: $vgpr0 +; GFX11TRUE16-NEXT: ; implicit-def: $vgpr1 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s1 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s0 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -985,10 +1036,11 @@ define double @v_copysign_f64_bf16(double %mag, bfloat %sign.bf16) { ; ; GFX11TRUE16-LABEL: v_copysign_f64_bf16: ; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: ; implicit-def: $vgpr3 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v2.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v2 +; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v3 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11FAKE16-LABEL: v_copysign_f64_bf16: @@ -1049,6 +1101,7 @@ define amdgpu_ps <2 x i32> @s_copysign_f64_bf16(double inreg %mag, bfloat inreg ; ; GFX11TRUE16-LABEL: s_copysign_f64_bf16: ; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: ; implicit-def: $vgpr0 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s2 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fffffff, s1, v0 @@ -3545,6 +3598,7 @@ define amdgpu_ps i32 @s_copysign_out_f32_mag_f32_sign_bf16(float inreg %mag, bfl ; ; GFX11TRUE16-LABEL: s_copysign_out_f32_mag_f32_sign_bf16: ; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: ; implicit-def: $vgpr0 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s1 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fffffff, s0, v0 @@ -3610,6 +3664,7 @@ define amdgpu_ps <2 x i32> @s_copysign_out_f64_mag_f64_sign_bf16(double inreg %m ; ; GFX11TRUE16-LABEL: s_copysign_out_f64_mag_f64_sign_bf16: ; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: ; implicit-def: $vgpr0 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s2 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fffffff, s1, v0 @@ -3677,6 +3732,7 @@ define amdgpu_ps i16 @s_copysign_out_bf16_mag_bf16_sign_f32(bfloat inreg %mag, f ; ; GFX11TRUE16-LABEL: s_copysign_out_bf16_mag_bf16_sign_f32: ; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: ; implicit-def: $vgpr0 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s0 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff0000, v0, s1 @@ -3744,6 +3800,7 @@ define amdgpu_ps i16 @s_copysign_out_bf16_mag_bf16_sign_f64(bfloat inreg %mag, d ; ; GFX11TRUE16-LABEL: s_copysign_out_bf16_mag_bf16_sign_f64: ; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: ; implicit-def: $vgpr0 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s0 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff0000, v0, s2 @@ -3840,14 +3897,16 @@ define amdgpu_ps i16 @s_copysign_out_bf16_mag_f32_sign_bf16(float inreg %mag, bf ; GFX11TRUE16-NEXT: s_add_i32 s2, s2, s0 ; GFX11TRUE16-NEXT: s_bitset1_b32 s0, 22 ; GFX11TRUE16-NEXT: s_addk_i32 s2, 0x7fff -; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s1 +; GFX11TRUE16-NEXT: ; implicit-def: $vgpr0 +; GFX11TRUE16-NEXT: ; implicit-def: $vgpr1 ; GFX11TRUE16-NEXT: s_and_b32 s3, s3, exec_lo ; GFX11TRUE16-NEXT: s_cselect_b32 s0, s0, s2 -; GFX11TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s1 ; GFX11TRUE16-NEXT: s_lshr_b32 s0, s0, 16 +; GFX11TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s0 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11TRUE16-NEXT: ; return to shader part epilog ; @@ -3997,10 +4056,11 @@ define <2 x float> @v_copysign_out_v2f32_mag_v2f32_sign_v2bf16(<2 x float> %mag, ; ; GFX11TRUE16-LABEL: v_copysign_out_v2f32_mag_v2f32_sign_v2bf16: ; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: ; implicit-def: $vgpr3 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l ; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v2 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v3 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -4071,10 +4131,11 @@ define <2 x double> @v_copysign_out_v2f64_mag_v2f64_sign_v2bf16(<2 x double> %ma ; ; GFX11TRUE16-LABEL: v_copysign_out_v2f64_mag_v2f64_sign_v2bf16: ; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: ; implicit-def: $vgpr5 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.h, v4.l ; GFX11TRUE16-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v4 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.h, v4.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v5 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -4864,14 +4925,16 @@ define amdgpu_ps <2 x i32> @s_copysign_out_v2f32_mag_v2f32_sign_v2bf16(<2 x floa ; GFX11TRUE16-LABEL: s_copysign_out_v2f32_mag_v2f32_sign_v2bf16: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_lshr_b32 s3, s2, 16 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s2 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, s3 +; GFX11TRUE16-NEXT: ; implicit-def: $vgpr0 +; GFX11TRUE16-NEXT: ; implicit-def: $vgpr1 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, s2 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s3 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fffffff, s0, v0 -; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, s1, v1 +; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, s0, v1 +; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fffffff, s1, v0 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11TRUE16-NEXT: v_readfirstlane_b32 s1, v1 +; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11TRUE16-NEXT: v_readfirstlane_b32 s1, v0 ; GFX11TRUE16-NEXT: ; return to shader part epilog ; ; GFX11FAKE16-LABEL: s_copysign_out_v2f32_mag_v2f32_sign_v2bf16: @@ -4961,6 +5024,8 @@ define amdgpu_ps <4 x i32> @s_copysign_out_v2f64_mag_v2f64_sign_v2bf16(<2 x doub ; GFX11TRUE16-LABEL: s_copysign_out_v2f64_mag_v2f64_sign_v2bf16: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_lshr_b32 s5, s4, 16 +; GFX11TRUE16-NEXT: ; implicit-def: $vgpr0 +; GFX11TRUE16-NEXT: ; implicit-def: $vgpr1 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s4 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, s5 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -5762,13 +5827,15 @@ define <3 x float> @v_copysign_out_v3f32_mag_v3f32_sign_v3bf16(<3 x float> %mag, ; ; GFX11TRUE16-LABEL: v_copysign_out_v3f32_mag_v3f32_sign_v3bf16: ; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: ; implicit-def: $vgpr5 +; GFX11TRUE16-NEXT: ; implicit-def: $vgpr6 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.h, v3.l -; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.h, v4.l ; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v3 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v4 -; GFX11TRUE16-NEXT: v_bfi_b32 v2, 0x7fffffff, v2, v5 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.h, v3.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v6.h, v4.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v5 +; GFX11TRUE16-NEXT: v_bfi_b32 v2, 0x7fffffff, v2, v6 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11FAKE16-LABEL: v_copysign_out_v3f32_mag_v3f32_sign_v3bf16: @@ -5848,13 +5915,15 @@ define <3 x double> @v_copysign_out_v3f64_mag_v3f64_sign_v3bf16(<3 x double> %ma ; ; GFX11TRUE16-LABEL: v_copysign_out_v3f64_mag_v3f64_sign_v3bf16: ; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: ; implicit-def: $vgpr8 +; GFX11TRUE16-NEXT: ; implicit-def: $vgpr9 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v7.h, v6.l -; GFX11TRUE16-NEXT: v_mov_b16_e32 v8.h, v7.l ; GFX11TRUE16-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v6 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v7 -; GFX11TRUE16-NEXT: v_bfi_b32 v5, 0x7fffffff, v5, v8 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v8.h, v6.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v9.h, v7.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v8 +; GFX11TRUE16-NEXT: v_bfi_b32 v5, 0x7fffffff, v5, v9 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11FAKE16-LABEL: v_copysign_out_v3f64_mag_v3f64_sign_v3bf16: @@ -6024,15 +6093,16 @@ define <3 x bfloat> @v_copysign_out_v3bf16_mag_v3f32_sign_v3bf16(<3 x float> %ma ; GFX11TRUE16-NEXT: v_add3_u32 v8, v8, v1, 0x7fff ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v9, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11TRUE16-NEXT: ; implicit-def: $vgpr5 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v8, v10, vcc_lo ; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v1, v3 -; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0x7fff7fff, v2, v4 +; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0x7fff7fff, v5, v4 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11FAKE16-LABEL: v_copysign_out_v3bf16_mag_v3f32_sign_v3bf16: @@ -6343,11 +6413,12 @@ define <3 x bfloat> @v_copysign_out_v3bf16_mag_v3f64_sign_v3bf16(<3 x double> %m ; GFX11TRUE16-NEXT: v_add3_u32 v10, v14, v10, 0x7fff ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v8, v15, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f64_e32 vcc_lo, v[4:5], v[4:5] +; GFX11TRUE16-NEXT: ; implicit-def: $vgpr4 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v9, v13, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f64_e32 vcc_lo, v[2:3], v[2:3] -; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.h ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v7 +; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0x7fff7fff, v4, v7 ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc_lo ; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h ; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v2, v6 @@ -6566,14 +6637,15 @@ define <3 x bfloat> @v_copysign_out_v3bf16_mag_v3bf16_sign_v3f32(<3 x bfloat> %m ; GFX11TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v9, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11TRUE16-NEXT: ; implicit-def: $vgpr5 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc_lo ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v10, vcc_lo ; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.h ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v2 +; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v5 ; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v3 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -6699,15 +6771,18 @@ define <3 x bfloat> @v_copysign_out_v3bf16_mag_v3bf16_sign_v3f64(<3 x bfloat> %m ; ; GFX11TRUE16-LABEL: v_copysign_out_v3bf16_mag_v3bf16_sign_v3f64: ; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: ; implicit-def: $vgpr2 +; GFX11TRUE16-NEXT: ; implicit-def: $vgpr4 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l ; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l +; GFX11TRUE16-NEXT: ; implicit-def: $vgpr1 ; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff0000, v0, v5 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0x7fff0000, v1, v3 +; GFX11TRUE16-NEXT: v_bfi_b32 v3, 0x7fff0000, v4, v3 ; GFX11TRUE16-NEXT: v_bfi_b32 v2, 0x7fff0000, v2, v7 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.h ; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -6905,12 +6980,14 @@ define <4 x float> @v_copysign_out_v4f32_mag_v4f32_sign_v4bf16(<4 x float> %mag, ; ; GFX11TRUE16-LABEL: v_copysign_out_v4f32_mag_v4f32_sign_v4bf16: ; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: ; implicit-def: $vgpr6 +; GFX11TRUE16-NEXT: ; implicit-def: $vgpr7 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v4 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v6.h, v4.l ; GFX11TRUE16-NEXT: v_mov_b16_e32 v7.h, v5.l -; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v4 ; GFX11TRUE16-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v5 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v6 ; GFX11TRUE16-NEXT: v_bfi_b32 v2, 0x7fffffff, v2, v7 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -7007,12 +7084,14 @@ define <4 x double> @v_copysign_out_v4f64_mag_v4f64_sign_v4bf16(<4 x double> %ma ; ; GFX11TRUE16-LABEL: v_copysign_out_v4f64_mag_v4f64_sign_v4bf16: ; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: ; implicit-def: $vgpr10 +; GFX11TRUE16-NEXT: ; implicit-def: $vgpr11 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11TRUE16-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v8 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v10.h, v8.l ; GFX11TRUE16-NEXT: v_mov_b16_e32 v11.h, v9.l -; GFX11TRUE16-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v8 ; GFX11TRUE16-NEXT: v_bfi_b32 v7, 0x7fffffff, v7, v9 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v10 ; GFX11TRUE16-NEXT: v_bfi_b32 v5, 0x7fffffff, v5, v11 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll index 574c1042859aa..5d42165b8ae81 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll @@ -45,6 +45,8 @@ define amdgpu_ps i16 @s_copysign_f16(half inreg %mag, half inreg %sign) { ; ; GFX11-TRUE16-LABEL: s_copysign_f16: ; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s1 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -385,11 +387,22 @@ define half @v_copysign_f16(half %mag, half %sign) { ; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_copysign_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_copysign_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v3, v2 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_copysign_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %result = call half @llvm.copysign.f16(half %mag, half %sign) ret half %result } @@ -688,10 +701,11 @@ define float @v_copysign_out_f32_mag_f32_sign_f16(float %mag, half %sign) { ; ; GFX11-TRUE16-LABEL: v_copysign_out_f32_mag_f32_sign_f16: ; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v1 +; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v2 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: v_copysign_out_f32_mag_f32_sign_f16: @@ -732,10 +746,11 @@ define double @v_copysign_out_f64_mag_f64_sign_f16(double %mag, half %sign) { ; ; GFX11-TRUE16-LABEL: v_copysign_out_f64_mag_f64_sign_f16: ; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v2 +; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v3 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: v_copysign_out_f64_mag_f64_sign_f16: @@ -778,10 +793,11 @@ define half @v_copysign_out_f16_mag_f16_sign_f32(half %mag, float %sign) { ; ; GFX11-TRUE16-LABEL: v_copysign_out_f16_mag_f16_sign_f32: ; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff0000, v0, v1 +; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff0000, v2, v1 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -825,10 +841,11 @@ define half @v_copysign_out_f16_mag_f16_sign_f64(half %mag, double %sign) { ; ; GFX11-TRUE16-LABEL: v_copysign_out_f16_mag_f16_sign_f64: ; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff0000, v0, v2 +; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff0000, v1, v2 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -872,10 +889,12 @@ define half @v_copysign_out_f16_mag_f32_sign_f16(float %mag, half %sign) { ; ; GFX11-TRUE16-LABEL: v_copysign_out_f16_mag_f32_sign_f16: ; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 +; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v2 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: v_copysign_out_f16_mag_f32_sign_f16: @@ -1070,13 +1089,14 @@ define half @v_copysign_out_f16_mag_f64_sign_f16(double %mag, half %sign) { ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, 0x7e00 ; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, 0x7c00, v5 :: v_dual_add_nc_u32 v3, v3, v4 ; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, 0x7c00, v3, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 +; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v4 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: v_copysign_out_f16_mag_f64_sign_f16: @@ -1294,11 +1314,13 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal ; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffe ; GFX11-TRUE16-NEXT: v_med3_i32 v1, s3, 0, 13 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s3, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v1 ; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s2 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1 ; GFX11-TRUE16-NEXT: s_or_b32 s3, s1, 0x1000 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s2 ; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s3, s4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s5, s4 @@ -2732,6 +2754,7 @@ define amdgpu_ps i32 @s_copysign_out_f32_mag_f32_sign_f16(float inreg %mag, half ; ; GFX11-TRUE16-LABEL: s_copysign_out_f32_mag_f32_sign_f16: ; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, s1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fffffff, s0, v0 @@ -2781,6 +2804,7 @@ define amdgpu_ps <2 x i32> @s_copysign_out_f64_mag_f64_sign_f16(double inreg %ma ; ; GFX11-TRUE16-LABEL: s_copysign_out_f64_mag_f64_sign_f16: ; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, s2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fffffff, s1, v0 @@ -2832,6 +2856,7 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f16_sign_f32(half inreg %mag, float ; ; GFX11-TRUE16-LABEL: s_copysign_out_f16_mag_f16_sign_f32: ; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff0000, v0, s1 @@ -2883,6 +2908,7 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f16_sign_f64(half inreg %mag, doubl ; ; GFX11-TRUE16-LABEL: s_copysign_out_f16_mag_f16_sign_f64: ; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff0000, v0, s2 @@ -2934,10 +2960,11 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f32_sign_f16(float inreg %mag, half ; ; GFX11-TRUE16-LABEL: s_copysign_out_f16_mag_f32_sign_f16: ; GFX11-TRUE16: ; %bb.0: -; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, s0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s1 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0 +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 +; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v1, v0 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-TRUE16-NEXT: ; return to shader part epilog ; @@ -3050,10 +3077,11 @@ define <2 x float> @v_copysign_out_v2f32_mag_v2f32_sign_v2f16(<2 x float> %mag, ; ; GFX11-TRUE16-LABEL: v_copysign_out_v2f32_mag_v2f32_sign_v2f16: ; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l ; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v3 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -3106,10 +3134,11 @@ define <2 x double> @v_copysign_out_v2f64_mag_v2f64_sign_v2f16(<2 x double> %mag ; ; GFX11-TRUE16-LABEL: v_copysign_out_v2f64_mag_v2f64_sign_v2f16: ; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v4.l ; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v4.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v5 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -3858,14 +3887,16 @@ define amdgpu_ps <2 x i32> @s_copysign_out_v2f32_mag_v2f32_sign_v2f16(<2 x float ; GFX11-TRUE16-LABEL: s_copysign_out_v2f32_mag_v2f32_sign_v2f16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s2, 16 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, s2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, s3 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, s3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fffffff, s0, v0 -; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, s1, v1 +; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, s0, v1 +; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fffffff, s1, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s1, v1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s1, v0 ; GFX11-TRUE16-NEXT: ; return to shader part epilog ; ; GFX11-FAKE16-LABEL: s_copysign_out_v2f32_mag_v2f32_sign_v2f16: @@ -3931,6 +3962,8 @@ define amdgpu_ps <4 x i32> @s_copysign_out_v2f64_mag_v2f64_sign_v2f16(<2 x doubl ; GFX11-TRUE16-LABEL: s_copysign_out_v2f64_mag_v2f64_sign_v2f16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s4, 16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, s4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, s5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -4670,13 +4703,15 @@ define <3 x float> @v_copysign_out_v3f32_mag_v3f32_sign_v3f16(<3 x float> %mag, ; ; GFX11-TRUE16-LABEL: v_copysign_out_v3f32_mag_v3f32_sign_v3f16: ; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v4.l ; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v4 -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0x7fffffff, v2, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v4.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v5 +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0x7fffffff, v2, v6 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: v_copysign_out_v3f32_mag_v3f32_sign_v3f16: @@ -4735,13 +4770,15 @@ define <3 x double> @v_copysign_out_v3f64_mag_v3f64_sign_v3f16(<3 x double> %mag ; ; GFX11-TRUE16-LABEL: v_copysign_out_v3f64_mag_v3f64_sign_v3f16: ; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v7.l ; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v7 -; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0x7fffffff, v5, v8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v7.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v8 +; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0x7fffffff, v5, v9 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: v_copysign_out_v3f64_mag_v3f64_sign_v3f16: @@ -5589,15 +5626,18 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f16_sign_v3f64(<3 x half> %mag, <3 ; ; GFX11-TRUE16-LABEL: v_copysign_out_v3f16_mag_v3f16_sign_v3f64: ; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1 ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff0000, v0, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fff0000, v1, v3 +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0x7fff0000, v4, v3 ; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0x7fff0000, v2, v7 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -5757,12 +5797,14 @@ define <4 x float> @v_copysign_out_v4f32_mag_v4f32_sign_v4f16(<4 x float> %mag, ; ; GFX11-TRUE16-LABEL: v_copysign_out_v4f32_mag_v4f32_sign_v4f16: ; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v5.l -; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v4 ; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v6 ; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0x7fffffff, v2, v7 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -5834,12 +5876,14 @@ define <4 x double> @v_copysign_out_v4f64_mag_v4f64_sign_v4f16(<4 x double> %mag ; ; GFX11-TRUE16-LABEL: v_copysign_out_v4f64_mag_v4f64_sign_v4f16: ; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v8 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v8.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v9.l -; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v8 ; GFX11-TRUE16-NEXT: v_bfi_b32 v7, 0x7fffffff, v7, v9 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v10 ; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0x7fffffff, v5, v11 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll index 0a2e758f7cf21..fc4de0645d217 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll @@ -999,6 +999,7 @@ define amdgpu_kernel void @s_test_copysign_f32_fpext_f16(ptr addrspace(1) %out, ; GFX11-LABEL: s_test_copysign_f32_fpext_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: ; implicit-def: $vgpr0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b16_e32 v0.h, s3 @@ -1094,6 +1095,7 @@ define amdgpu_kernel void @s_test_copysign_f32_fpext_bf16(ptr addrspace(1) %out, ; GFX11-LABEL: s_test_copysign_f32_fpext_bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: ; implicit-def: $vgpr0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b16_e32 v0.h, s3 diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll index 8b5c34d97e50e..8a9a8c3f121d3 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll @@ -372,6 +372,7 @@ define amdgpu_kernel void @s_test_copysign_f64_f16(ptr addrspace(1) %out, [8 x i ; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x74 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x4c ; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX11-NEXT: ; implicit-def: $vgpr0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_mov_b16_e32 v0.h, s6 @@ -971,10 +972,11 @@ define double @v_test_copysign_f64_f16(ptr addrspace(1) %out, [8 x i32], double ; ; GFX11-LABEL: v_test_copysign_f64_f16: ; GFX11: ; %bb.0: +; GFX11-NEXT: ; implicit-def: $vgpr1 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b16_e32 v1.h, v20.l ; GFX11-NEXT: v_mov_b32_e32 v0, v10 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_mov_b16_e32 v1.h, v20.l +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v11, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %sign.ext = fpext half %sign to double diff --git a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll index e7af7467171c3..a51790693ff16 100644 --- a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll @@ -98,19 +98,21 @@ define i128 @fptosi_f64_to_i128(double %x) { ; SDAG-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v2 ; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, v0, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, v1, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v6, v0, v4, s[6:7] -; SDAG-NEXT: v_cndmask_b32_e64 v5, v1, v5, s[6:7] -; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0 +; SDAG-NEXT: v_cndmask_b32_e64 v7, v0, v4, s[6:7] +; SDAG-NEXT: v_cndmask_b32_e64 v6, v1, v5, s[6:7] +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v10, 0 ; SDAG-NEXT: v_mov_b32_e32 v2, 0 -; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v10, v[1:2] +; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v10, v[1:2] ; SDAG-NEXT: v_mov_b32_e32 v1, v3 -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v8, v[1:2] +; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v8, v[1:2] ; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v4, v2 ; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v8, v[2:3] -; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v6, v[2:3] -; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v6, v[3:4] -; SDAG-NEXT: v_mad_i32_i24 v3, v9, v5, v3 +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v8, v[2:3] +; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v7, v[2:3] +; SDAG-NEXT: v_mov_b32_e32 v4, v3 +; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v7, v[4:5] +; SDAG-NEXT: v_mad_i32_i24 v3, v9, v6, v3 ; SDAG-NEXT: .LBB0_6: ; %Flow1 ; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] ; SDAG-NEXT: .LBB0_7: ; %Flow2 @@ -464,19 +466,21 @@ define i128 @fptoui_f64_to_i128(double %x) { ; SDAG-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v2 ; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, v0, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, v1, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v6, v0, v4, s[6:7] -; SDAG-NEXT: v_cndmask_b32_e64 v5, v1, v5, s[6:7] -; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0 +; SDAG-NEXT: v_cndmask_b32_e64 v7, v0, v4, s[6:7] +; SDAG-NEXT: v_cndmask_b32_e64 v6, v1, v5, s[6:7] +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v10, 0 ; SDAG-NEXT: v_mov_b32_e32 v2, 0 -; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v10, v[1:2] +; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v10, v[1:2] ; SDAG-NEXT: v_mov_b32_e32 v1, v3 -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v8, v[1:2] +; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v8, v[1:2] ; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v4, v2 ; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v8, v[2:3] -; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v6, v[2:3] -; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v6, v[3:4] -; SDAG-NEXT: v_mad_i32_i24 v3, v9, v5, v3 +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v8, v[2:3] +; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v7, v[2:3] +; SDAG-NEXT: v_mov_b32_e32 v4, v3 +; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v7, v[4:5] +; SDAG-NEXT: v_mad_i32_i24 v3, v9, v6, v3 ; SDAG-NEXT: .LBB1_6: ; %Flow1 ; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] ; SDAG-NEXT: .LBB1_7: ; %Flow2 diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll index c4a38dcd7b5f3..6b9e1febd6606 100644 --- a/llvm/test/CodeGen/AMDGPU/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/frem.ll @@ -538,8 +538,9 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; GFX11-TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v3, v2 ; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB0_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %frem.else -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l ; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v3, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0x7fff, v5, v4 @@ -622,6 +623,7 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc_lo ; GFX11-TRUE16-NEXT: v_ldexp_f32 v2, v3, v2 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v2.l, v2 @@ -770,8 +772,9 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; GFX1150-TRUE16-NEXT: s_cmp_ngt_f32 s1, s0 ; GFX1150-TRUE16-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX1150-TRUE16-NEXT: ; %bb.1: ; %frem.else -; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l +; GFX1150-TRUE16-NEXT: ; implicit-def: $vgpr2 ; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l ; GFX1150-TRUE16-NEXT: s_cmp_eq_f32 s1, s0 ; GFX1150-TRUE16-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -858,6 +861,7 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; GFX1150-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc_lo ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1150-TRUE16-NEXT: v_ldexp_f32 v2, v3, v2 +; GFX1150-TRUE16-NEXT: ; implicit-def: $vgpr3 ; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l ; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v2.l, v2 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1015,8 +1019,9 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; GFX1200-TRUE16-NEXT: s_cmp_ngt_f32 s1, s0 ; GFX1200-TRUE16-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX1200-TRUE16-NEXT: ; %bb.1: ; %frem.else -; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l +; GFX1200-TRUE16-NEXT: ; implicit-def: $vgpr2 ; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l ; GFX1200-TRUE16-NEXT: s_cmp_eq_f32 s1, s0 ; GFX1200-TRUE16-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1106,6 +1111,7 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; GFX1200-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc_lo ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1200-TRUE16-NEXT: v_ldexp_f32 v2, v3, v2 +; GFX1200-TRUE16-NEXT: ; implicit-def: $vgpr3 ; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l ; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v2.l, v2 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -5795,8 +5801,9 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v4, v3 ; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB9_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %frem.else -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l ; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v4, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, v5, v2 @@ -5879,6 +5886,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc_lo ; GFX11-TRUE16-NEXT: v_ldexp_f32 v2, v3, v2 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v2.l, v2 @@ -5893,8 +5901,9 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v6, v5 ; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB9_10 ; GFX11-TRUE16-NEXT: ; %bb.9: ; %frem.else20 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.l ; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v6, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_bfi_b32 v7, 0x7fff, v8, v7 @@ -5977,6 +5986,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc_lo ; GFX11-TRUE16-NEXT: v_ldexp_f32 v5, v6, v5 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v5.l, v5 @@ -6233,8 +6243,9 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: s_cmp_ngt_f32 s6, s5 ; GFX1150-TRUE16-NEXT: s_cbranch_scc0 .LBB9_2 ; GFX1150-TRUE16-NEXT: ; %bb.1: ; %frem.else -; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v0.l, s4 +; GFX1150-TRUE16-NEXT: ; implicit-def: $vgpr0 ; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v0.l, s4 ; GFX1150-TRUE16-NEXT: s_cmp_eq_f32 s6, s5 ; GFX1150-TRUE16-NEXT: s_cselect_b32 s7, -1, 0 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -6321,6 +6332,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1150-TRUE16-NEXT: v_ldexp_f32 v0, v1, v0 +; GFX1150-TRUE16-NEXT: ; implicit-def: $vgpr1 ; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v1.l, s4 ; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -6336,8 +6348,9 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: s_cmp_ngt_f32 s8, s7 ; GFX1150-TRUE16-NEXT: s_cbranch_scc0 .LBB9_10 ; GFX1150-TRUE16-NEXT: ; %bb.9: ; %frem.else20 -; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v1.l, s6 +; GFX1150-TRUE16-NEXT: ; implicit-def: $vgpr1 ; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v1.l, s6 ; GFX1150-TRUE16-NEXT: s_cmp_eq_f32 s8, s7 ; GFX1150-TRUE16-NEXT: s_cselect_b32 s9, -1, 0 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -6424,6 +6437,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc_lo ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1150-TRUE16-NEXT: v_ldexp_f32 v1, v2, v1 +; GFX1150-TRUE16-NEXT: ; implicit-def: $vgpr2 ; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v2.l, s6 ; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.l, v1 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -6703,8 +6717,9 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: s_cmp_ngt_f32 s6, s5 ; GFX1200-TRUE16-NEXT: s_cbranch_scc0 .LBB9_2 ; GFX1200-TRUE16-NEXT: ; %bb.1: ; %frem.else -; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v0.l, s4 +; GFX1200-TRUE16-NEXT: ; implicit-def: $vgpr0 ; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v0.l, s4 ; GFX1200-TRUE16-NEXT: s_cmp_eq_f32 s6, s5 ; GFX1200-TRUE16-NEXT: s_cselect_b32 s7, -1, 0 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -6794,6 +6809,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1200-TRUE16-NEXT: v_ldexp_f32 v0, v1, v0 +; GFX1200-TRUE16-NEXT: ; implicit-def: $vgpr1 ; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v1.l, s4 ; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -6812,8 +6828,9 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: s_cmp_ngt_f32 s8, s7 ; GFX1200-TRUE16-NEXT: s_cbranch_scc0 .LBB9_10 ; GFX1200-TRUE16-NEXT: ; %bb.9: ; %frem.else20 -; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v1.l, s6 +; GFX1200-TRUE16-NEXT: ; implicit-def: $vgpr1 ; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v1.l, s6 ; GFX1200-TRUE16-NEXT: s_cmp_eq_f32 s8, s7 ; GFX1200-TRUE16-NEXT: s_cselect_b32 s9, -1, 0 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -6904,6 +6921,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc_lo ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1200-TRUE16-NEXT: v_ldexp_f32 v1, v2, v1 +; GFX1200-TRUE16-NEXT: ; implicit-def: $vgpr2 ; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v2.l, s6 ; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.l, v1 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -8976,8 +8994,9 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v6, v5 ; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB10_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %frem.else -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l ; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v6, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0x7fff, v7, v4 @@ -9060,6 +9079,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc_lo ; GFX11-TRUE16-NEXT: v_ldexp_f32 v4, v5, v4 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v4.l, v4 @@ -9074,8 +9094,9 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v9, v8 ; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB10_10 ; GFX11-TRUE16-NEXT: ; %bb.9: ; %frem.else20 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l ; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v9, v8 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_bfi_b32 v7, 0x7fff, v10, v7 @@ -9158,6 +9179,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_ldexp_f32 v7, v8, v7 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v7.l, v7 @@ -9169,8 +9191,9 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v10, v9 ; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB10_18 ; GFX11-TRUE16-NEXT: ; %bb.17: ; %frem.else53 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v1.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v1.l ; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v10, v9 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_bfi_b32 v8, 0x7fff, v11, v8 @@ -9253,6 +9276,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc_lo ; GFX11-TRUE16-NEXT: v_ldexp_f32 v8, v9, v8 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v8.l, v8 @@ -9267,8 +9291,9 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v13, v12 ; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB10_26 ; GFX11-TRUE16-NEXT: ; %bb.25: ; %frem.else86 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v9.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v9.l ; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v13, v12 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_bfi_b32 v11, 0x7fff, v14, v11 @@ -9351,6 +9376,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc_lo ; GFX11-TRUE16-NEXT: v_ldexp_f32 v11, v12, v11 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v11.l, v11 @@ -9817,8 +9843,9 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: s_cmp_ngt_f32 s8, s6 ; GFX1150-TRUE16-NEXT: s_cbranch_scc0 .LBB10_2 ; GFX1150-TRUE16-NEXT: ; %bb.1: ; %frem.else -; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v0.l, s5 +; GFX1150-TRUE16-NEXT: ; implicit-def: $vgpr0 ; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v0.l, s5 ; GFX1150-TRUE16-NEXT: s_cmp_eq_f32 s8, s6 ; GFX1150-TRUE16-NEXT: s_cselect_b32 s9, -1, 0 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -9905,6 +9932,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1150-TRUE16-NEXT: v_ldexp_f32 v0, v1, v0 +; GFX1150-TRUE16-NEXT: ; implicit-def: $vgpr1 ; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v1.l, s5 ; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -9920,8 +9948,9 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: s_cmp_ngt_f32 s10, s9 ; GFX1150-TRUE16-NEXT: s_cbranch_scc0 .LBB10_10 ; GFX1150-TRUE16-NEXT: ; %bb.9: ; %frem.else20 -; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v1.l, s8 +; GFX1150-TRUE16-NEXT: ; implicit-def: $vgpr1 ; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v1.l, s8 ; GFX1150-TRUE16-NEXT: s_cmp_eq_f32 s10, s9 ; GFX1150-TRUE16-NEXT: s_cselect_b32 s11, -1, 0 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -10008,6 +10037,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc_lo ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1150-TRUE16-NEXT: v_ldexp_f32 v1, v2, v1 +; GFX1150-TRUE16-NEXT: ; implicit-def: $vgpr2 ; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v2.l, s8 ; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.l, v1 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -10021,8 +10051,9 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: s_cmp_ngt_f32 s10, s9 ; GFX1150-TRUE16-NEXT: s_cbranch_scc0 .LBB10_18 ; GFX1150-TRUE16-NEXT: ; %bb.17: ; %frem.else53 -; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v2.l, s7 +; GFX1150-TRUE16-NEXT: ; implicit-def: $vgpr2 ; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v2.l, s7 ; GFX1150-TRUE16-NEXT: s_cmp_eq_f32 s10, s9 ; GFX1150-TRUE16-NEXT: s_cselect_b32 s11, -1, 0 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -10109,6 +10140,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc_lo ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1150-TRUE16-NEXT: v_ldexp_f32 v2, v3, v2 +; GFX1150-TRUE16-NEXT: ; implicit-def: $vgpr3 ; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v3.l, s7 ; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v2.l, v2 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -10124,8 +10156,9 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: s_cmp_ngt_f32 s12, s11 ; GFX1150-TRUE16-NEXT: s_cbranch_scc0 .LBB10_26 ; GFX1150-TRUE16-NEXT: ; %bb.25: ; %frem.else86 -; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v3.l, s10 +; GFX1150-TRUE16-NEXT: ; implicit-def: $vgpr3 ; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 +; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v3.l, s10 ; GFX1150-TRUE16-NEXT: s_cmp_eq_f32 s12, s11 ; GFX1150-TRUE16-NEXT: s_cselect_b32 s13, -1, 0 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -10212,6 +10245,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc_lo ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1150-TRUE16-NEXT: v_ldexp_f32 v3, v4, v3 +; GFX1150-TRUE16-NEXT: ; implicit-def: $vgpr4 ; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v4.l, s10 ; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v3.l, v3 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -10725,8 +10759,9 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: s_cmp_ngt_f32 s8, s6 ; GFX1200-TRUE16-NEXT: s_cbranch_scc0 .LBB10_2 ; GFX1200-TRUE16-NEXT: ; %bb.1: ; %frem.else -; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v0.l, s5 +; GFX1200-TRUE16-NEXT: ; implicit-def: $vgpr0 ; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v0.l, s5 ; GFX1200-TRUE16-NEXT: s_cmp_eq_f32 s8, s6 ; GFX1200-TRUE16-NEXT: s_cselect_b32 s9, -1, 0 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -10816,6 +10851,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1200-TRUE16-NEXT: v_ldexp_f32 v0, v1, v0 +; GFX1200-TRUE16-NEXT: ; implicit-def: $vgpr1 ; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v1.l, s5 ; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -10834,8 +10870,9 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: s_cmp_ngt_f32 s10, s9 ; GFX1200-TRUE16-NEXT: s_cbranch_scc0 .LBB10_10 ; GFX1200-TRUE16-NEXT: ; %bb.9: ; %frem.else20 -; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v1.l, s8 +; GFX1200-TRUE16-NEXT: ; implicit-def: $vgpr1 ; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v1.l, s8 ; GFX1200-TRUE16-NEXT: s_cmp_eq_f32 s10, s9 ; GFX1200-TRUE16-NEXT: s_cselect_b32 s11, -1, 0 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -10926,6 +10963,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc_lo ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1200-TRUE16-NEXT: v_ldexp_f32 v1, v2, v1 +; GFX1200-TRUE16-NEXT: ; implicit-def: $vgpr2 ; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v2.l, s8 ; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.l, v1 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -10941,8 +10979,9 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: s_cmp_ngt_f32 s10, s9 ; GFX1200-TRUE16-NEXT: s_cbranch_scc0 .LBB10_18 ; GFX1200-TRUE16-NEXT: ; %bb.17: ; %frem.else53 -; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v2.l, s7 +; GFX1200-TRUE16-NEXT: ; implicit-def: $vgpr2 ; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v2.l, s7 ; GFX1200-TRUE16-NEXT: s_cmp_eq_f32 s10, s9 ; GFX1200-TRUE16-NEXT: s_cselect_b32 s11, -1, 0 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -11034,6 +11073,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc_lo ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1200-TRUE16-NEXT: v_ldexp_f32 v2, v3, v2 +; GFX1200-TRUE16-NEXT: ; implicit-def: $vgpr3 ; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v3.l, s7 ; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v2.l, v2 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -11052,8 +11092,9 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: s_cmp_ngt_f32 s12, s11 ; GFX1200-TRUE16-NEXT: s_cbranch_scc0 .LBB10_26 ; GFX1200-TRUE16-NEXT: ; %bb.25: ; %frem.else86 -; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v3.l, s10 +; GFX1200-TRUE16-NEXT: ; implicit-def: $vgpr3 ; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 +; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v3.l, s10 ; GFX1200-TRUE16-NEXT: s_cmp_eq_f32 s12, s11 ; GFX1200-TRUE16-NEXT: s_cselect_b32 s13, -1, 0 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -11144,6 +11185,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc_lo ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1200-TRUE16-NEXT: v_ldexp_f32 v3, v4, v3 +; GFX1200-TRUE16-NEXT: ; implicit-def: $vgpr4 ; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v4.l, s10 ; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v3.l, v3 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll index f67ab18dd8ef1..5a4cc8892a422 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll @@ -17424,32 +17424,62 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16(i16 %arg) #0 { ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: test_call_external_void_func_v1bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, s33 -; GFX11-NEXT: s_mov_b32 s33, s32 -; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 -; GFX11-NEXT: s_mov_b32 s1, external_void_func_v1bf16@abs32@hi -; GFX11-NEXT: s_mov_b32 s0, external_void_func_v1bf16@abs32@lo -; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: s_mov_b32 s32, s33 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 -; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload -; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_mov_b32 s33, s0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: test_call_external_void_func_v1bf16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s33 +; GFX11-TRUE16-NEXT: s_mov_b32 s33, s32 +; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX11-TRUE16-NEXT: s_mov_b32 s1, external_void_func_v1bf16@abs32@hi +; GFX11-TRUE16-NEXT: s_mov_b32 s0, external_void_func_v1bf16@abs32@lo +; GFX11-TRUE16-NEXT: s_add_i32 s32, s32, 16 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v1 +; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1 +; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-TRUE16-NEXT: s_mov_b32 s33, s0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: test_call_external_void_func_v1bf16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s0, s33 +; GFX11-FAKE16-NEXT: s_mov_b32 s33, s32 +; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s1, -1 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, external_void_func_v1bf16@abs32@hi +; GFX11-FAKE16-NEXT: s_mov_b32 s0, external_void_func_v1bf16@abs32@lo +; GFX11-FAKE16-NEXT: s_add_i32 s32, s32, 16 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-FAKE16-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33 +; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s1, -1 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-FAKE16-NEXT: s_mov_b32 s33, s0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v1bf16: ; GFX10-SCRATCH: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll b/llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll index 492bd1b508bc6..2f6e237b693d9 100644 --- a/llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll @@ -536,28 +536,29 @@ define <3 x bfloat> @v_uitofp_v3i1_to_v3bf16(<3 x i1> %num) { ; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.l ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v1.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v2, 16, 1 ; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v2, 0x7fff ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v1, v7, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v1, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v4.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -614,31 +615,31 @@ define <3 x bfloat> @v_uitofp_v3i1_to_v3bf16(<3 x i1> %num) { ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, vcc_lo ; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.h -; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo ; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v1.l -; GFX12-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_bfe_u32 v1, v2, 16, 1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, vcc_lo +; GFX12-TRUE16-NEXT: v_add3_u32 v1, v1, v2, 0x7fff ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX12-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX12-TRUE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v1, 16, 1 +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v1, v7, vcc_lo ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1 -; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v1, 0x7fff +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr1 +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.h +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v4.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h @@ -1862,28 +1863,29 @@ define <3 x bfloat> @v_sitofp_v3i1_to_v3bf16(<3 x i1> %num) { ; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.l ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, -1.0, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v1.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v2, 16, 1 ; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, -1.0, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v2, 0x7fff ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v1, v7, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v1, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v4.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -1940,31 +1942,31 @@ define <3 x bfloat> @v_sitofp_v3i1_to_v3bf16(<3 x i1> %num) { ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, -1.0, vcc_lo ; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.h -; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc_lo ; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v1.l -; GFX12-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_bfe_u32 v1, v2, 16, 1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, -1.0, vcc_lo +; GFX12-TRUE16-NEXT: v_add3_u32 v1, v1, v2, 0x7fff ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX12-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 ; GFX12-TRUE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v1, 16, 1 +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v1, v7, vcc_lo ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1 -; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v1, 0x7fff +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr1 +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.h +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v4.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll index ab38bd21994ec..35eed55eae194 100644 --- a/llvm/test/CodeGen/AMDGPU/idot4s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll @@ -1161,6 +1161,8 @@ define amdgpu_kernel void @idot4_acc16_vecMul(ptr addrspace(1) %src1, ; GFX11-DL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-DL-TRUE16-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-DL-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-DL-TRUE16-NEXT: ; implicit-def: $vgpr4 +; GFX11-DL-TRUE16-NEXT: ; implicit-def: $vgpr5 ; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) @@ -1169,28 +1171,28 @@ define amdgpu_kernel void @idot4_acc16_vecMul(ptr addrspace(1) %src1, ; GFX11-DL-TRUE16-NEXT: global_load_b32 v2, v0, s[2:3] ; GFX11-DL-TRUE16-NEXT: global_load_d16_b16 v0, v3, s[4:5] ; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(2) -; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v6, v1, 0, 8 +; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v8, v1, 0, 8 ; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v5, v2, 0, 8 -; GFX11-DL-TRUE16-NEXT: v_ashrrev_i16 v4.h, 8, v1.l -; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h -; GFX11-DL-TRUE16-NEXT: v_ashrrev_i16 v7.h, 8, v2.l -; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h -; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l -; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v4.l, v6.l -; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v6, v1, 0, 8 +; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v7, v2, 0, 8 +; GFX11-DL-TRUE16-NEXT: v_ashrrev_i16 v6.h, 8, v1.l +; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.h +; GFX11-DL-TRUE16-NEXT: v_ashrrev_i16 v9.h, 8, v2.l +; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h +; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v9.l, v7.l +; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v6.l, v8.l +; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v4, v4, 0, 8 ; GFX11-DL-TRUE16-NEXT: v_ashrrev_i16 v1.h, 8, v1.h -; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v5, v2, 0, 8 +; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v5, v5, 0, 8 ; GFX11-DL-TRUE16-NEXT: v_ashrrev_i16 v2.h, 8, v2.h -; GFX11-DL-TRUE16-NEXT: v_pk_mul_lo_u16 v4, v4, v7 -; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v1.l, v6.l +; GFX11-DL-TRUE16-NEXT: v_pk_mul_lo_u16 v6, v6, v9 +; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v1.l, v4.l ; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.l ; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-DL-TRUE16-NEXT: v_add_nc_u16 v0.l, v4.l, v0.l +; GFX11-DL-TRUE16-NEXT: v_add_nc_u16 v0.l, v6.l, v0.l ; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-DL-TRUE16-NEXT: v_pk_mul_lo_u16 v1, v1, v2 -; GFX11-DL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v4.h +; GFX11-DL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v6.h ; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-DL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l ; GFX11-DL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.h @@ -3424,6 +3426,7 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1, ; GFX11-DL-TRUE16: ; %bb.0: ; %entry ; GFX11-DL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-DL-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-DL-TRUE16-NEXT: ; implicit-def: $vgpr4 ; GFX11-DL-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0 @@ -3432,25 +3435,25 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1, ; GFX11-DL-TRUE16-NEXT: global_load_b32 v2, v0, s[0:1] ; GFX11-DL-TRUE16-NEXT: global_load_b32 v3, v0, s[2:3] ; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 8, v2 +; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v2 ; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v1, v2, 0, 8 ; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-DL-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v3.l -; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v3 -; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.h -; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v4, v4, 0, 8 +; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 8, v3 +; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.h +; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v5, v5, 0, 8 ; GFX11-DL-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.h ; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v1.l, v0.l -; GFX11-DL-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v5.l -; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v5, v6, 0, 8 -; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v1.l, v4.l -; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 24, v2 +; GFX11-DL-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v6.l +; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v4, v4, 0, 8 +; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v1.l, v5.l +; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 24, v2 ; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 24, v3 ; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.l +; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.l ; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v1.l, v0.l ; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v4, v4, 0, 8 +; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v4, v5, 0, 8 ; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v1.h, v2.l, v0.l ; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v1.l, v4.l diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll index 305461ed6b208..09487900587f6 100644 --- a/llvm/test/CodeGen/AMDGPU/idot4u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll @@ -1963,6 +1963,7 @@ define amdgpu_kernel void @notdot4_mixedtypes2(ptr addrspace(1) %src1, ; GFX11-DL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-DL-TRUE16-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX11-DL-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-DL-TRUE16-NEXT: ; implicit-def: $vgpr6 ; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) @@ -1974,23 +1975,23 @@ define amdgpu_kernel void @notdot4_mixedtypes2(ptr addrspace(1) %src1, ; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v3 ; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 8, v4 -; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v6, v4, 0, 8 -; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.h +; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v7, v4, 0, 8 +; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.h ; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-DL-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l ; GFX11-DL-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l -; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v7, v7, 0, 8 -; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v2.l, v6.l -; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 24, v3 +; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v6, v6, 0, 8 +; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.l +; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v3 ; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v1.l, v0.l ; GFX11-DL-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v3.h -; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v1.l, v7.l +; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v1.l, v6.l ; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 24, v4 ; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v2.l, v1.h, v0.l -; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v2, v6, 0, 8 +; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v2, v7, 0, 8 ; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v1.l, v0.h, v0.l ; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll index 792d7db26d076..51802beed4368 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -850,15 +850,16 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(ptr addrspace(1) %out, ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x10 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e64 v2, 16, s4 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e64 v1, 16, s4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v1, 16, v2 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX11-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-TRUE16-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll index 31b6b533866d4..f49ddcdf345c5 100644 --- a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll +++ b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll @@ -5778,25 +5778,31 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX7-GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v0 ; GFX7-GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v3, v[1:2] -; GFX7-GISEL-NEXT: v_add_i32_e32 v8, vcc, v0, v6 +; GFX7-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX7-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v3, v[4:5] +; GFX7-GISEL-NEXT: v_add_i32_e32 v1, vcc, v0, v6 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v2, v[4:5] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v2, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v2, 0 ; GFX7-GISEL-NEXT: v_addc_u32_e32 v9, vcc, v4, v7, vcc -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v6 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v3, v[1:2] +; GFX7-GISEL-NEXT: ; implicit-def: $vgpr7_vgpr8 +; GFX7-GISEL-NEXT: v_mov_b32_e32 v7, v6 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v3, v[7:8] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v2, v[6:7] ; GFX7-GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc +; GFX7-GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v4, vcc ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v6, 0 -; GFX7-GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v5 +; GFX7-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX7-GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v5 ; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v4 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v1, v[0:1] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v7, 0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v2, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v7, v[0:1] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v8, 0 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, v[4:5] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v8, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v7, v[1:2] +; GFX7-GISEL-NEXT: ; implicit-def: $vgpr5_vgpr6 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v2, vcc +; GFX7-GISEL-NEXT: v_mov_b32_e32 v5, v1 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v9, v[5:6] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v8, v[1:2] ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: clpeak_imad_pat_i64: @@ -5834,25 +5840,31 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX8-GISEL-NEXT: v_add_u32_e32 v6, vcc, 1, v0 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v3, v[1:2] -; GFX8-GISEL-NEXT: v_add_u32_e32 v8, vcc, v0, v6 +; GFX8-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v3, v[4:5] +; GFX8-GISEL-NEXT: v_add_u32_e32 v1, vcc, v0, v6 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v2, v[4:5] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v2, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v2, 0 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v9, vcc, v4, v7, vcc -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v6 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v3, v[1:2] +; GFX8-GISEL-NEXT: ; implicit-def: $vgpr7_vgpr8 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v7, v6 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v3, v[7:8] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v2, v[6:7] ; GFX8-GISEL-NEXT: v_add_u32_e32 v6, vcc, 1, v0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc +; GFX8-GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v4, vcc ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v6, 0 -; GFX8-GISEL-NEXT: v_add_u32_e32 v7, vcc, 1, v5 +; GFX8-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX8-GISEL-NEXT: v_add_u32_e32 v8, vcc, 1, v5 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v1, v[0:1] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v7, 0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v2, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v7, v[0:1] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v8, 0 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, v[4:5] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v8, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v7, v[1:2] +; GFX8-GISEL-NEXT: ; implicit-def: $vgpr5_vgpr6 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v2, vcc +; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, v1 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v9, v[5:6] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v8, v[1:2] ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-SDAG-LABEL: clpeak_imad_pat_i64: @@ -5886,25 +5898,31 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX900-GISEL-NEXT: v_add_co_u32_e32 v6, vcc, 1, v0 ; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc ; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v3, v[1:2] -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, v0, v6 +; GFX900-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v3, v[4:5] +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v1, vcc, v0, v6 ; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v2, v[4:5] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v2, 0 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v2, 0 ; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v9, vcc, v4, v7, vcc -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v3, v[1:2] +; GFX900-GISEL-NEXT: ; implicit-def: $vgpr7_vgpr8 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v7, v6 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v3, v[7:8] ; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v2, v[6:7] ; GFX900-GISEL-NEXT: v_add_co_u32_e32 v6, vcc, 1, v0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v4, vcc +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v4, vcc ; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v6, 0 -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v7, vcc, 1, v5 +; GFX900-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, 1, v5 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v1, v[0:1] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v7, 0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v2, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v7, v[0:1] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v8, 0 ; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, v[4:5] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v8, v[1:2] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v7, v[1:2] +; GFX900-GISEL-NEXT: ; implicit-def: $vgpr5_vgpr6 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v2, vcc +; GFX900-GISEL-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v9, v[5:6] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v8, v[1:2] ; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-SDAG-LABEL: clpeak_imad_pat_i64: @@ -5991,26 +6009,32 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, 1 ; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo +; GFX10-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v6, v2, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v6, v3, v[1:2] -; GFX10-GISEL-NEXT: v_add_co_u32 v8, vcc_lo, v0, v6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX10-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v0, v6 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v6, v3, v[4:5] ; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v7, v2, v[4:5] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v8, v2, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v1, v2, 0 ; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v9, null, v4, v7, vcc_lo -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, v6 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v8, v3, v[1:2] -; GFX10-GISEL-NEXT: v_add_co_u32 v8, vcc_lo, v0, 1 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v10, null, 0, v4, vcc_lo -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v5, v8, 0 +; GFX10-GISEL-NEXT: ; implicit-def: $vgpr7_vgpr8 +; GFX10-GISEL-NEXT: v_add_co_u32 v10, vcc_lo, v0, 1 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v7, v6 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v4, vcc_lo +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v1, v3, v[7:8] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v5, v10, 0 ; GFX10-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v9, v2, v[6:7] -; GFX10-GISEL-NEXT: v_add_co_u32 v7, vcc_lo, v5, 1 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, v4 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v3, v7, 0 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v6, vcc_lo -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v5, v10, v[2:3] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v3, v9, v[1:2] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v6, v8, v[4:5] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v4, v7, v[1:2] +; GFX10-GISEL-NEXT: v_add_co_u32 v9, vcc_lo, v5, 1 +; GFX10-GISEL-NEXT: ; implicit-def: $vgpr7_vgpr8 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v7, v4 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v3, v9, 0 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v12, null, 0, v6, vcc_lo +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v5, v11, v[7:8] +; GFX10-GISEL-NEXT: ; implicit-def: $vgpr7_vgpr8 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v7, v1 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v6, v10, v[4:5] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v3, v12, v[7:8] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v1, v9, v[2:3] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: clpeak_imad_pat_i64: @@ -6049,37 +6073,45 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX11-GISEL-LABEL: clpeak_imad_pat_i64: ; GFX11-GISEL: ; %bb.0: ; %entry ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_add_co_u32 v7, vcc_lo, v0, 1 +; GFX11-GISEL-NEXT: v_add_co_u32 v8, vcc_lo, v0, 1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v8, null, 0, v1, vcc_lo -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v7, v2, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v7, v3, v[1:2] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v8, v2, v[4:5] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v0, v7 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v4, v2, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v10, null, v5, v8, vcc_lo -; GFX11-GISEL-NEXT: v_add_co_u32 v11, vcc_lo, v0, 1 -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v12, null, 0, v5, vcc_lo -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, v7 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v4, v3, v[1:2] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v6, v11, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v10, v2, v[7:8] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, v4 -; GFX11-GISEL-NEXT: v_add_co_u32 v9, vcc_lo, v6, 1 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v6, v12, v[2:3] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v3, v9, 0 -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v10, null, 0, v8, vcc_lo -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v8, v11, v[4:5] +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v1, vcc_lo +; GFX11-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v8, v2, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX11-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v0, v8 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v3, v10, v[1:2] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v5, v9, v[6:7] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v8, v3, v[4:5] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v9, v2, v[6:7] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v1, v2, 0 +; GFX11-GISEL-NEXT: ; implicit-def: $vgpr7_vgpr8 +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v11, null, v4, v9, vcc_lo +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mov_b32_e32 v7, v6 +; GFX11-GISEL-NEXT: v_add_co_u32 v12, vcc_lo, v0, 1 +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v4, vcc_lo +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v1, v3, v[7:8] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v5, v12, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v11, v2, v[9:10] +; GFX11-GISEL-NEXT: v_add_co_u32 v11, vcc_lo, v5, 1 +; GFX11-GISEL-NEXT: ; implicit-def: $vgpr7_vgpr8 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v3, v11, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-GISEL-NEXT: v_mov_b32_e32 v7, v4 +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v2, null, 0, v6, vcc_lo +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v5, v13, v[7:8] +; GFX11-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v6, v12, v[9:10] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v3, v2, v[4:5] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v7, v11, v[8:9] ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX1200-SDAG-LABEL: clpeak_imad_pat_i64: @@ -6410,50 +6442,60 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v0 ; GFX7-GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc -; GFX7-GISEL-NEXT: v_add_i32_e32 v14, vcc, 1, v2 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v4, 0 +; GFX7-GISEL-NEXT: v_add_i32_e32 v14, vcc, 1, v2 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v6, 0 ; GFX7-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v6, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v5, v[1:2] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v3 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v7, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v4, v[8:9] +; GFX7-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX7-GISEL-NEXT: ; implicit-def: $vgpr10_vgpr11 +; GFX7-GISEL-NEXT: v_mov_b32_e32 v2, v1 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v5, v[2:3] +; GFX7-GISEL-NEXT: v_mov_b32_e32 v10, v9 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v7, v[10:11] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v13, v4, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v15, v6, v[9:10] ; GFX7-GISEL-NEXT: v_add_i32_e32 v3, vcc, v0, v12 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v15, v6, v[10:11] -; GFX7-GISEL-NEXT: v_addc_u32_e32 v16, vcc, v8, v13, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v4, 0 -; GFX7-GISEL-NEXT: v_add_i32_e32 v17, vcc, v2, v14 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v6, 0 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v11 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v3, v5, v[1:2] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v14 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v18, vcc, v9, v15, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v17, v7, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v4, v[11:12] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v18, v6, v[14:15] -; GFX7-GISEL-NEXT: v_add_i32_e32 v11, vcc, 1, v0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v11, 0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v8, vcc -; GFX7-GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v2 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v3, v4, 0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v16, vcc, v1, v13, vcc +; GFX7-GISEL-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GFX7-GISEL-NEXT: v_add_i32_e32 v17, vcc, v8, v14 +; GFX7-GISEL-NEXT: v_mov_b32_e32 v11, v10 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v5, v[11:12] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v6, 0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v18, vcc, v2, v15, vcc +; GFX7-GISEL-NEXT: ; implicit-def: $vgpr14_vgpr15 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v4, v[10:11] +; GFX7-GISEL-NEXT: v_mov_b32_e32 v14, v13 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v7, v[14:15] +; GFX7-GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v18, v6, v[13:14] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v10, 0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc +; GFX7-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX7-GISEL-NEXT: v_add_i32_e32 v11, vcc, 1, v8 ; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v6 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v1, v[0:1] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, 0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v2, vcc, 0, v9, vcc -; GFX7-GISEL-NEXT: v_add_i32_e32 v14, vcc, 1, v10 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v11, v[0:1] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v9, v7, v[0:1] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v11, 0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; GFX7-GISEL-NEXT: v_add_i32_e32 v13, vcc, 1, v9 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v10, v[0:1] +; GFX7-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v14, vcc, 0, v3, vcc ; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v7 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v13, v2, v[0:1] -; GFX7-GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v13 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v14, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v12, v[2:3] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v16, 0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v4, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v15, v[1:2] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v3 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v17, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v14, v[4:5] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v16, v[10:11] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v2, v[0:1] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v13, 0 +; GFX7-GISEL-NEXT: v_add_i32_e32 v15, vcc, 1, v12 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v11, v[2:3] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v15, 0 +; GFX7-GISEL-NEXT: ; implicit-def: $vgpr10_vgpr11 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v16, vcc, 0, v4, vcc +; GFX7-GISEL-NEXT: v_mov_b32_e32 v10, v1 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v14, v[10:11] +; GFX7-GISEL-NEXT: ; implicit-def: $vgpr10_vgpr11 +; GFX7-GISEL-NEXT: v_mov_b32_e32 v10, v3 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v16, v[10:11] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v13, v[4:5] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v15, v[10:11] ; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v5 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -6515,50 +6557,60 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_add_u32_e32 v12, vcc, 1, v0 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v14, vcc, 1, v2 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v4, 0 +; GFX8-GISEL-NEXT: v_add_u32_e32 v14, vcc, 1, v2 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v6, 0 ; GFX8-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v6, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v5, v[1:2] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v3 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v7, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v4, v[8:9] +; GFX8-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-GISEL-NEXT: ; implicit-def: $vgpr10_vgpr11 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v5, v[2:3] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v10, v9 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v7, v[10:11] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v13, v4, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v15, v6, v[9:10] ; GFX8-GISEL-NEXT: v_add_u32_e32 v3, vcc, v0, v12 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v15, v6, v[10:11] -; GFX8-GISEL-NEXT: v_addc_u32_e32 v16, vcc, v8, v13, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v4, 0 -; GFX8-GISEL-NEXT: v_add_u32_e32 v17, vcc, v2, v14 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v6, 0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v11 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v3, v5, v[1:2] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v14 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v18, vcc, v9, v15, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v17, v7, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v4, v[11:12] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v18, v6, v[14:15] -; GFX8-GISEL-NEXT: v_add_u32_e32 v11, vcc, 1, v0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v11, 0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v8, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v12, vcc, 1, v2 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v3, v4, 0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v16, vcc, v1, v13, vcc +; GFX8-GISEL-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GFX8-GISEL-NEXT: v_add_u32_e32 v17, vcc, v8, v14 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v11, v10 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v5, v[11:12] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v6, 0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v18, vcc, v2, v15, vcc +; GFX8-GISEL-NEXT: ; implicit-def: $vgpr14_vgpr15 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v4, v[10:11] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v14, v13 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v7, v[14:15] +; GFX8-GISEL-NEXT: v_add_u32_e32 v10, vcc, 1, v0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v18, v6, v[13:14] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v10, 0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc +; GFX8-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX8-GISEL-NEXT: v_add_u32_e32 v11, vcc, 1, v8 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v6 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v1, v[0:1] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, 0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v2, vcc, 0, v9, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v14, vcc, 1, v10 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v11, v[0:1] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v9, v7, v[0:1] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v11, 0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; GFX8-GISEL-NEXT: v_add_u32_e32 v13, vcc, 1, v9 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v10, v[0:1] +; GFX8-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v14, vcc, 0, v3, vcc ; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v7 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v13, v2, v[0:1] -; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, 1, v13 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v14, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v12, v[2:3] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v16, 0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v4, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v15, v[1:2] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v3 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v17, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v14, v[4:5] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v16, v[10:11] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v2, v[0:1] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v13, 0 +; GFX8-GISEL-NEXT: v_add_u32_e32 v15, vcc, 1, v12 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v11, v[2:3] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v15, 0 +; GFX8-GISEL-NEXT: ; implicit-def: $vgpr10_vgpr11 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v16, vcc, 0, v4, vcc +; GFX8-GISEL-NEXT: v_mov_b32_e32 v10, v1 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v14, v[10:11] +; GFX8-GISEL-NEXT: ; implicit-def: $vgpr10_vgpr11 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v10, v3 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v16, v[10:11] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v13, v[4:5] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v15, v[10:11] ; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -6612,50 +6664,60 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, 1, v0 ; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v1, vcc -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 1, v2 ; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v4, 0 +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 1, v2 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v6, 0 ; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v3, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v6, 0 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v5, v[1:2] -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v7, v[1:2] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v4, v[8:9] +; GFX900-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX900-GISEL-NEXT: ; implicit-def: $vgpr10_vgpr11 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v5, v[2:3] +; GFX900-GISEL-NEXT: v_mov_b32_e32 v10, v9 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v7, v[10:11] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v13, v4, v[1:2] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v15, v6, v[9:10] ; GFX900-GISEL-NEXT: v_add_co_u32_e32 v3, vcc, v0, v12 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v15, v6, v[10:11] -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v16, vcc, v8, v13, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v4, 0 -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v17, vcc, v2, v14 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v6, 0 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v3, v5, v[1:2] -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v14 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v18, vcc, v9, v15, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v17, v7, v[1:2] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v4, v[11:12] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v18, v6, v[14:15] -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v11, vcc, 1, v0 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v11, 0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v8, vcc -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, 1, v2 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v3, v4, 0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v16, vcc, v1, v13, vcc +; GFX900-GISEL-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v17, vcc, v8, v14 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v11, v10 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v5, v[11:12] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v6, 0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v18, vcc, v2, v15, vcc +; GFX900-GISEL-NEXT: ; implicit-def: $vgpr14_vgpr15 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v4, v[10:11] +; GFX900-GISEL-NEXT: v_mov_b32_e32 v14, v13 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v7, v[14:15] +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v10, vcc, 1, v0 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v18, v6, v[13:14] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v10, 0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc +; GFX900-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v11, vcc, 1, v8 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v1, v[0:1] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, 0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v9, vcc -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 1, v10 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v11, v[0:1] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v9, v7, v[0:1] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v11, 0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v13, vcc, 1, v9 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v10, v[0:1] +; GFX900-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v14, vcc, 0, v3, vcc ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v3, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v13, v2, v[0:1] -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v16, vcc, 1, v13 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v14, 0 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v12, v[2:3] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v16, 0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v4, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v15, v[1:2] -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v17, v[1:2] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v14, v[4:5] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v16, v[10:11] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v2, v[0:1] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v13, 0 +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v15, vcc, 1, v12 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v11, v[2:3] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v15, 0 +; GFX900-GISEL-NEXT: ; implicit-def: $vgpr10_vgpr11 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v16, vcc, 0, v4, vcc +; GFX900-GISEL-NEXT: v_mov_b32_e32 v10, v1 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v14, v[10:11] +; GFX900-GISEL-NEXT: ; implicit-def: $vgpr10_vgpr11 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v10, v3 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v16, v[10:11] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v13, v[4:5] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v15, v[10:11] ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v5 ; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -6811,44 +6873,56 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v12, v4, 0 ; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v3, vcc_lo ; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v14, v6, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v12, v5, v[1:2] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v14, v7, v[3:4] -; GFX10-GISEL-NEXT: v_add_co_u32 v3, vcc_lo, v0, v12 +; GFX10-GISEL-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GFX10-GISEL-NEXT: ; implicit-def: $vgpr10_vgpr11 +; GFX10-GISEL-NEXT: ; implicit-def: $vgpr17_vgpr18 +; GFX10-GISEL-NEXT: ; implicit-def: $vgpr19_vgpr20 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v8, v1 +; GFX10-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v0, v12 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v10, v3 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v12, v5, v[8:9] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v14, v7, v[10:11] ; GFX10-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v13, v4, v[8:9] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[11:12], null, v3, v4, 0 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v16, null, v10, v13, vcc_lo -; GFX10-GISEL-NEXT: v_add_co_u32 v17, vcc_lo, v2, v14 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[11:12], null, v1, v4, 0 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, v10, v13, vcc_lo +; GFX10-GISEL-NEXT: v_add_co_u32 v21, vcc_lo, v2, v14 ; GFX10-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v15, v6, v[9:10] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[13:14], null, v17, v6, 0 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v18, null, v8, v15, vcc_lo -; GFX10-GISEL-NEXT: v_add_co_u32 v19, vcc_lo, v0, 1 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, v14 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v20, null, 0, v10, vcc_lo -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v11, v19, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[14:15], null, v3, v5, v[0:1] -; GFX10-GISEL-NEXT: v_add_co_u32 v15, vcc_lo, v2, 1 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v17, v7, v[1:2] -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v12, null, 0, v8, vcc_lo -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v13, v15, 0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, v10 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v16, v4, v[14:15] -; GFX10-GISEL-NEXT: v_add_co_u32 v14, vcc_lo, v11, 1 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v18, v6, v[0:1] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v6, v8 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v11, v20, v[1:2] -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v16, null, 0, v4, vcc_lo -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[11:12], null, v13, v12, v[6:7] -; GFX10-GISEL-NEXT: v_add_co_u32 v17, vcc_lo, v13, 1 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v9, v14, 0 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v18, null, 0, v5, vcc_lo -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[12:13], null, v4, v19, v[10:11] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v7, v17, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v5, v15, v[11:12] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v9, v16, v[1:2] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v7, v18, v[3:4] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v12, v14, v[5:6] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v4, v17, v[7:8] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[13:14], null, v21, v6, 0 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v22, null, v8, v15, vcc_lo +; GFX10-GISEL-NEXT: v_add_co_u32 v23, vcc_lo, v0, 1 +; GFX10-GISEL-NEXT: ; implicit-def: $vgpr15_vgpr16 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v17, v14 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v15, v12 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v24, null, 0, v10, vcc_lo +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v11, v23, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v1, v5, v[15:16] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[14:15], null, v21, v7, v[17:18] +; GFX10-GISEL-NEXT: v_add_co_u32 v17, vcc_lo, v2, 1 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v8, vcc_lo +; GFX10-GISEL-NEXT: v_mov_b32_e32 v19, v10 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v13, v17, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v3, v4, v[0:1] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v22, v6, v[14:15] +; GFX10-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v11, 1 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v11, v24, v[19:20] +; GFX10-GISEL-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v18, null, 0, v4, vcc_lo +; GFX10-GISEL-NEXT: v_mov_b32_e32 v11, v8 +; GFX10-GISEL-NEXT: v_add_co_u32 v21, vcc_lo, v13, 1 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v9, v6, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[11:12], null, v13, v15, v[11:12] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v7, v21, 0 +; GFX10-GISEL-NEXT: ; implicit-def: $vgpr15_vgpr16 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v5, vcc_lo +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[12:13], null, v4, v23, v[10:11] +; GFX10-GISEL-NEXT: ; implicit-def: $vgpr13_vgpr14 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v15, v3 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v13, v1 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v7, v22, v[15:16] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v5, v17, v[11:12] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v9, v18, v[13:14] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v12, v6, v[4:5] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v3, v21, v[7:8] ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -6911,62 +6985,75 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX11-GISEL-LABEL: clpeak_imad_pat_v2i64: ; GFX11-GISEL: ; %bb.0: ; %entry ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_add_co_u32 v13, vcc_lo, v0, 1 +; GFX11-GISEL-NEXT: v_add_co_u32 v14, vcc_lo, v0, 1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v14, null, 0, v1, vcc_lo -; GFX11-GISEL-NEXT: v_add_co_u32 v15, vcc_lo, v2, 1 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v13, v4, 0 -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v16, null, 0, v3, vcc_lo -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v15, v6, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v13, v5, v[1:2] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v15, v7, v[3:4] -; GFX11-GISEL-NEXT: v_add_co_u32 v3, vcc_lo, v0, v13 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v14, v4, v[8:9] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[11:12], null, v16, v6, v[9:10] -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v17, null, v10, v14, vcc_lo -; GFX11-GISEL-NEXT: v_add_co_u32 v18, vcc_lo, v2, v15 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v3, v4, 0 -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v19, null, v11, v16, vcc_lo -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[12:13], null, v18, v6, 0 -; GFX11-GISEL-NEXT: v_add_co_u32 v20, vcc_lo, v0, 1 -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v10, vcc_lo -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, v9 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v8, v20, 0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[13:14], null, v3, v5, v[0:1] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, v10 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[14:15], null, v18, v7, v[1:2] -; GFX11-GISEL-NEXT: v_add_co_u32 v18, vcc_lo, v2, 1 -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v11, vcc_lo +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v1, vcc_lo +; GFX11-GISEL-NEXT: v_add_co_u32 v16, vcc_lo, v2, 1 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v14, v4, 0 +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v3, vcc_lo ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[15:16], null, v17, v4, v[13:14] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v12, v18, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v19, v6, v[14:15] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v8, v21, v[0:1] -; GFX11-GISEL-NEXT: v_add_co_u32 v14, vcc_lo, v8, 1 -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v16, null, 0, v15, vcc_lo -; GFX11-GISEL-NEXT: v_add_co_u32 v17, vcc_lo, v12, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v12, v22, v[5:6] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v9, v14, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v4, v17, 0 -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v10, vcc_lo -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[11:12], null, v15, v20, v[6:7] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[12:13], null, v10, v18, v[7:8] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v16, v6, 0 +; GFX11-GISEL-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GFX11-GISEL-NEXT: ; implicit-def: $vgpr10_vgpr11 +; GFX11-GISEL-NEXT: ; implicit-def: $vgpr19_vgpr20 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v8, v1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_mov_b32_e32 v10, v3 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[12:13], null, v14, v5, v[8:9] +; GFX11-GISEL-NEXT: v_add_co_u32 v3, vcc_lo, v0, v14 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v16, v7, v[10:11] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v15, v4, v[12:13] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v17, v6, v[8:9] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[11:12], null, v3, v4, 0 +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v21, null, v9, v15, vcc_lo +; GFX11-GISEL-NEXT: v_add_co_u32 v22, vcc_lo, v2, v16 +; GFX11-GISEL-NEXT: ; implicit-def: $vgpr15_vgpr16 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v23, null, v10, v17, vcc_lo +; GFX11-GISEL-NEXT: v_mov_b32_e32 v15, v12 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[13:14], null, v22, v6, 0 +; GFX11-GISEL-NEXT: v_add_co_u32 v24, vcc_lo, v0, 1 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v3, v5, v[15:16] +; GFX11-GISEL-NEXT: ; implicit-def: $vgpr17_vgpr18 +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v9, vcc_lo ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v9, v16, v[1:2] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v4, v19, v[3:4] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v11, v14, v[7:8] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v12, v17, v[8:9] +; GFX11-GISEL-NEXT: v_mov_b32_e32 v17, v14 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v11, v24, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[14:15], null, v22, v7, v[17:18] +; GFX11-GISEL-NEXT: v_add_co_u32 v18, vcc_lo, v2, 1 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[15:16], null, v21, v4, v[0:1] +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v12, null, 0, v10, vcc_lo +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v13, v18, 0 +; GFX11-GISEL-NEXT: v_add_co_u32 v21, vcc_lo, v11, 1 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v19, v9 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v23, v6, v[14:15] +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v15, vcc_lo +; GFX11-GISEL-NEXT: v_add_co_u32 v23, vcc_lo, v13, 1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v11, v25, v[19:20] +; GFX11-GISEL-NEXT: ; implicit-def: $vgpr10_vgpr11 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v8, v21, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v4, v23, 0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v10, v5 +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v9, vcc_lo +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[16:17], null, v13, v12, v[10:11] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v15, v24, v[6:7] +; GFX11-GISEL-NEXT: ; implicit-def: $vgpr5_vgpr6 +; GFX11-GISEL-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v5, v1 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v11, v3 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[13:14], null, v9, v18, v[16:17] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[14:15], null, v8, v22, v[5:6] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v4, v26, v[11:12] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v10, v21, v[14:15] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v13, v23, v[7:8] ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, v5 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll index b84fb520e0519..c1e4ff7f09a72 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll @@ -839,26 +839,51 @@ define float @test_sext_cvt_f32_fp8(i16 %a) { ; GFX9X-NEXT: v_cvt_f32_fp8_sdwa v0, v0 src0_sel:BYTE_1 ; GFX9X-NEXT: s_setpc_b64 s[30:31] ; -; GFX12-LABEL: test_sext_cvt_f32_fp8: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_cvt_f32_fp8_e64 v0, v0 byte_sel:1 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: test_sext_cvt_f32_fp8: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_bfe_i32 v0, v1, 0, 16 +; GFX12-TRUE16-NEXT: v_cvt_f32_fp8_e64 v0, v0 byte_sel:1 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX1250-LABEL: test_sext_cvt_f32_fp8: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_cvt_f32_fp8_e64 v0, v0 byte_sel:1 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX12-FAKE16-LABEL: test_sext_cvt_f32_fp8: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cvt_f32_fp8_e64 v0, v0 byte_sel:1 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-TRUE16-LABEL: test_sext_cvt_f32_fp8: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: ; implicit-def: $vgpr1 +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_bfe_i32 v0, v1, 0, 16 +; GFX1250-TRUE16-NEXT: v_cvt_f32_fp8_e64 v0, v0 byte_sel:1 +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: test_sext_cvt_f32_fp8: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_cvt_f32_fp8_e64 v0, v0 byte_sel:1 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] %a.sext = sext i16 %a to i32 %ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a.sext, i32 1) ret float %ret @@ -872,26 +897,51 @@ define float @test_sext_cvt_f32_bf8(i16 %a) { ; GFX9X-NEXT: v_cvt_f32_bf8_sdwa v0, v0 src0_sel:BYTE_1 ; GFX9X-NEXT: s_setpc_b64 s[30:31] ; -; GFX12-LABEL: test_sext_cvt_f32_bf8: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 byte_sel:1 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: test_sext_cvt_f32_bf8: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_bfe_i32 v0, v1, 0, 16 +; GFX12-TRUE16-NEXT: v_cvt_f32_bf8_e64 v0, v0 byte_sel:1 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX1250-LABEL: test_sext_cvt_f32_bf8: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_cvt_f32_bf8_e64 v0, v0 byte_sel:1 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX12-FAKE16-LABEL: test_sext_cvt_f32_bf8: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cvt_f32_bf8_e64 v0, v0 byte_sel:1 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-TRUE16-LABEL: test_sext_cvt_f32_bf8: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: ; implicit-def: $vgpr1 +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_bfe_i32 v0, v1, 0, 16 +; GFX1250-TRUE16-NEXT: v_cvt_f32_bf8_e64 v0, v0 byte_sel:1 +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: test_sext_cvt_f32_bf8: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_cvt_f32_bf8_e64 v0, v0 byte_sel:1 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] %a.sext = sext i16 %a to i32 %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a.sext, i32 1) ret float %ret @@ -905,26 +955,51 @@ define <2 x float> @test_sext_cvt_pk_f32_bf8_word1(i16 %a) { ; GFX9X-NEXT: v_cvt_pk_f32_bf8_sdwa v[0:1], v0 src0_sel:WORD_1 ; GFX9X-NEXT: s_setpc_b64 s[30:31] ; -; GFX12-LABEL: test_sext_cvt_pk_f32_bf8_word1: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_cvt_pk_f32_bf8_e64 v[0:1], v0 op_sel:[1,0] -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: test_sext_cvt_pk_f32_bf8_word1: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_bfe_i32 v0, v1, 0, 16 +; GFX12-TRUE16-NEXT: v_cvt_pk_f32_bf8_e64 v[0:1], v0 op_sel:[1,0] +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX1250-LABEL: test_sext_cvt_pk_f32_bf8_word1: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_cvt_pk_f32_bf8_e64 v[0:1], v0 op_sel:[1,0] -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX12-FAKE16-LABEL: test_sext_cvt_pk_f32_bf8_word1: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cvt_pk_f32_bf8_e64 v[0:1], v0 op_sel:[1,0] +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-TRUE16-LABEL: test_sext_cvt_pk_f32_bf8_word1: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: ; implicit-def: $vgpr1 +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_bfe_i32 v0, v1, 0, 16 +; GFX1250-TRUE16-NEXT: v_cvt_pk_f32_bf8_e64 v[0:1], v0 op_sel:[1,0] +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: test_sext_cvt_pk_f32_bf8_word1: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_cvt_pk_f32_bf8_e64 v[0:1], v0 op_sel:[1,0] +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] %a.sext = sext i16 %a to i32 %ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.bf8(i32 %a.sext, i1 true) ret <2 x float> %ret @@ -938,26 +1013,51 @@ define <2 x float> @test_sext_cvt_pk_f32_fp8_word0(i16 %a) { ; GFX9X-NEXT: v_cvt_pk_f32_fp8_e32 v[0:1], v0 ; GFX9X-NEXT: s_setpc_b64 s[30:31] ; -; GFX12-LABEL: test_sext_cvt_pk_f32_fp8_word0: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_cvt_pk_f32_fp8_e32 v[0:1], v0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: test_sext_cvt_pk_f32_fp8_word0: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_bfe_i32 v0, v1, 0, 16 +; GFX12-TRUE16-NEXT: v_cvt_pk_f32_fp8_e32 v[0:1], v0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX1250-LABEL: test_sext_cvt_pk_f32_fp8_word0: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_cvt_pk_f32_fp8_e32 v[0:1], v0 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX12-FAKE16-LABEL: test_sext_cvt_pk_f32_fp8_word0: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cvt_pk_f32_fp8_e32 v[0:1], v0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-TRUE16-LABEL: test_sext_cvt_pk_f32_fp8_word0: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: ; implicit-def: $vgpr1 +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_bfe_i32 v0, v1, 0, 16 +; GFX1250-TRUE16-NEXT: v_cvt_pk_f32_fp8_e32 v[0:1], v0 +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: test_sext_cvt_pk_f32_fp8_word0: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_cvt_pk_f32_fp8_e32 v[0:1], v0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] %a.sext = sext i16 %a to i32 %ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.fp8(i32 %a.sext, i1 false) ret <2 x float> %ret diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll index d45705edce2c8..89133a690b856 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll @@ -98,9 +98,10 @@ define amdgpu_ps <4 x float> @gather4_cube(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX11-TRUE16: ; %bb.0: ; %main_body ; GFX11-TRUE16-NEXT: s_mov_b32 s12, exec_lo ; GFX11-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l ; GFX11-TRUE16-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX11-TRUE16-NEXT: image_gather4 v[0:3], v[2:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_CUBE a16 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -120,9 +121,11 @@ define amdgpu_ps <4 x float> @gather4_cube(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX12-TRUE16: ; %bb.0: ; %main_body ; GFX12-TRUE16-NEXT: s_mov_b32 s12, exec_lo ; GFX12-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l ; GFX12-TRUE16-NEXT: s_and_b32 exec_lo, exec_lo, s12 -; GFX12-TRUE16-NEXT: image_gather4 v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_CUBE a16 +; GFX12-TRUE16-NEXT: image_gather4 v[0:3], [v0, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_CUBE a16 ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: ; return to shader part epilog ; @@ -166,9 +169,10 @@ define amdgpu_ps <4 x float> @gather4_2darray(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX11-TRUE16: ; %bb.0: ; %main_body ; GFX11-TRUE16-NEXT: s_mov_b32 s12, exec_lo ; GFX11-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l ; GFX11-TRUE16-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX11-TRUE16-NEXT: image_gather4 v[0:3], v[2:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY a16 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -188,9 +192,11 @@ define amdgpu_ps <4 x float> @gather4_2darray(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX12-TRUE16: ; %bb.0: ; %main_body ; GFX12-TRUE16-NEXT: s_mov_b32 s12, exec_lo ; GFX12-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l ; GFX12-TRUE16-NEXT: s_and_b32 exec_lo, exec_lo, s12 -; GFX12-TRUE16-NEXT: image_gather4 v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY a16 +; GFX12-TRUE16-NEXT: image_gather4 v[0:3], [v0, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY a16 ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: ; return to shader part epilog ; @@ -300,9 +306,10 @@ define amdgpu_ps <4 x float> @gather4_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX11-TRUE16: ; %bb.0: ; %main_body ; GFX11-TRUE16-NEXT: s_mov_b32 s12, exec_lo ; GFX11-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l ; GFX11-TRUE16-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX11-TRUE16-NEXT: image_gather4_cl v[0:3], v[2:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -322,9 +329,11 @@ define amdgpu_ps <4 x float> @gather4_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX12-TRUE16: ; %bb.0: ; %main_body ; GFX12-TRUE16-NEXT: s_mov_b32 s12, exec_lo ; GFX12-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l ; GFX12-TRUE16-NEXT: s_and_b32 exec_lo, exec_lo, s12 -; GFX12-TRUE16-NEXT: image_gather4_cl v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX12-TRUE16-NEXT: image_gather4_cl v[0:3], [v0, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: ; return to shader part epilog ; @@ -370,9 +379,11 @@ define amdgpu_ps <4 x float> @gather4_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX11-TRUE16: ; %bb.0: ; %main_body ; GFX11-TRUE16-NEXT: s_mov_b32 s12, exec_lo ; GFX11-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v3.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l ; GFX11-TRUE16-NEXT: s_and_b32 exec_lo, exec_lo, s12 -; GFX11-TRUE16-NEXT: image_gather4_c_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX11-TRUE16-NEXT: image_gather4_c_cl v[0:3], [v0, v1, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: ; return to shader part epilog ; @@ -390,9 +401,11 @@ define amdgpu_ps <4 x float> @gather4_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX12-TRUE16: ; %bb.0: ; %main_body ; GFX12-TRUE16-NEXT: s_mov_b32 s12, exec_lo ; GFX12-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v3.l ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l ; GFX12-TRUE16-NEXT: s_and_b32 exec_lo, exec_lo, s12 -; GFX12-TRUE16-NEXT: image_gather4_c_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX12-TRUE16-NEXT: image_gather4_c_cl v[0:3], [v0, v1, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: ; return to shader part epilog ; @@ -436,9 +449,12 @@ define amdgpu_ps <4 x float> @gather4_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX11-TRUE16: ; %bb.0: ; %main_body ; GFX11-TRUE16-NEXT: s_mov_b32 s12, exec_lo ; GFX11-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l ; GFX11-TRUE16-NEXT: s_and_b32 exec_lo, exec_lo, s12 -; GFX11-TRUE16-NEXT: image_gather4_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX11-TRUE16-NEXT: image_gather4_b v[0:3], v[2:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: ; return to shader part epilog ; @@ -457,8 +473,10 @@ define amdgpu_ps <4 x float> @gather4_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX12-TRUE16-NEXT: s_mov_b32 s12, exec_lo ; GFX12-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr2 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l ; GFX12-TRUE16-NEXT: s_and_b32 exec_lo, exec_lo, s12 -; GFX12-TRUE16-NEXT: image_gather4_b v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX12-TRUE16-NEXT: image_gather4_b v[0:3], [v2, v1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: ; return to shader part epilog ; @@ -503,8 +521,10 @@ define amdgpu_ps <4 x float> @gather4_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX11-TRUE16-NEXT: s_mov_b32 s12, exec_lo ; GFX11-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v3.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l ; GFX11-TRUE16-NEXT: s_and_b32 exec_lo, exec_lo, s12 -; GFX11-TRUE16-NEXT: image_gather4_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX11-TRUE16-NEXT: image_gather4_c_b v[0:3], [v3, v1, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: ; return to shader part epilog ; @@ -523,8 +543,10 @@ define amdgpu_ps <4 x float> @gather4_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX12-TRUE16-NEXT: s_mov_b32 s12, exec_lo ; GFX12-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, v3.l +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l ; GFX12-TRUE16-NEXT: s_and_b32 exec_lo, exec_lo, s12 -; GFX12-TRUE16-NEXT: image_gather4_c_b v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX12-TRUE16-NEXT: image_gather4_c_b v[0:3], [v3, v1, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: ; return to shader part epilog ; @@ -570,9 +592,11 @@ define amdgpu_ps <4 x float> @gather4_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX11-TRUE16: ; %bb.0: ; %main_body ; GFX11-TRUE16-NEXT: s_mov_b32 s12, exec_lo ; GFX11-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v3.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l ; GFX11-TRUE16-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX11-TRUE16-NEXT: image_gather4_b_cl v[0:3], v[2:4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 @@ -593,9 +617,13 @@ define amdgpu_ps <4 x float> @gather4_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX12-TRUE16: ; %bb.0: ; %main_body ; GFX12-TRUE16-NEXT: s_mov_b32 s12, exec_lo ; GFX12-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr4 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr2 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v3.l +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l ; GFX12-TRUE16-NEXT: s_and_b32 exec_lo, exec_lo, s12 -; GFX12-TRUE16-NEXT: image_gather4_b_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX12-TRUE16-NEXT: image_gather4_b_cl v[0:3], [v2, v1, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: ; return to shader part epilog ; @@ -642,9 +670,13 @@ define amdgpu_ps <4 x float> @gather4_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX11-TRUE16: ; %bb.0: ; %main_body ; GFX11-TRUE16-NEXT: s_mov_b32 s12, exec_lo ; GFX11-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v3.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l ; GFX11-TRUE16-NEXT: s_and_b32 exec_lo, exec_lo, s12 -; GFX11-TRUE16-NEXT: image_gather4_c_b_cl v[0:3], [v0, v1, v2, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX11-TRUE16-NEXT: image_gather4_c_b_cl v[0:3], [v3, v1, v2, v5], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: ; return to shader part epilog ; @@ -662,9 +694,13 @@ define amdgpu_ps <4 x float> @gather4_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX12-TRUE16: ; %bb.0: ; %main_body ; GFX12-TRUE16-NEXT: s_mov_b32 s12, exec_lo ; GFX12-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr5 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, v3.l +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.l +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l ; GFX12-TRUE16-NEXT: s_and_b32 exec_lo, exec_lo, s12 -; GFX12-TRUE16-NEXT: image_gather4_c_b_cl v[0:3], [v0, v1, v2, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX12-TRUE16-NEXT: image_gather4_c_b_cl v[0:3], [v3, v1, v2, v5], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: ; return to shader part epilog ; @@ -700,9 +736,10 @@ define amdgpu_ps <4 x float> @gather4_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inre ; ; GFX11-TRUE16-LABEL: gather4_l_2d: ; GFX11-TRUE16: ; %bb.0: ; %main_body +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l ; GFX11-TRUE16-NEXT: image_gather4_l v[0:3], v[2:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: ; return to shader part epilog @@ -716,8 +753,10 @@ define amdgpu_ps <4 x float> @gather4_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inre ; ; GFX12-TRUE16-LABEL: gather4_l_2d: ; GFX12-TRUE16: ; %bb.0: ; %main_body +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr3 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l -; GFX12-TRUE16-NEXT: image_gather4_l v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l +; GFX12-TRUE16-NEXT: image_gather4_l v[0:3], [v0, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: ; return to shader part epilog ; @@ -752,8 +791,10 @@ define amdgpu_ps <4 x float> @gather4_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; ; GFX11-TRUE16-LABEL: gather4_c_l_2d: ; GFX11-TRUE16: ; %bb.0: ; %main_body +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l -; GFX11-TRUE16-NEXT: image_gather4_c_l v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v3.l +; GFX11-TRUE16-NEXT: image_gather4_c_l v[0:3], [v0, v1, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: ; return to shader part epilog ; @@ -766,8 +807,10 @@ define amdgpu_ps <4 x float> @gather4_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; ; GFX12-TRUE16-LABEL: gather4_c_l_2d: ; GFX12-TRUE16: ; %bb.0: ; %main_body +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr4 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l -; GFX12-TRUE16-NEXT: image_gather4_c_l v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v3.l +; GFX12-TRUE16-NEXT: image_gather4_c_l v[0:3], [v0, v1, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: ; return to shader part epilog ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll index 3d64ef16a3c8c..0e2579f0a1232 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll @@ -285,9 +285,10 @@ main_body: define amdgpu_ps <4 x float> @load_2dmsaa_a16(<8 x i32> inreg %rsrc, i16 %s, i16 %t, i16 %fragid) { ; GFX11-TRUE16-LABEL: load_2dmsaa_a16: ; GFX11-TRUE16: ; %bb.0: ; %main_body +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l ; encoding: [0x01,0x39,0x04,0x7f] ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l ; encoding: [0x02,0x39,0x06,0x7e] ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l ; encoding: [0x00,0x39,0x04,0x7e] -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l ; encoding: [0x01,0x39,0x04,0x7f] ; GFX11-TRUE16-NEXT: image_msaa_load v[0:3], v[2:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16 ; encoding: [0x98,0x01,0x61,0xf0,0x02,0x00,0x00,0x00] ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] ; GFX11-TRUE16-NEXT: ; return to shader part epilog @@ -301,8 +302,10 @@ define amdgpu_ps <4 x float> @load_2dmsaa_a16(<8 x i32> inreg %rsrc, i16 %s, i16 ; ; GFX12-TRUE16-LABEL: load_2dmsaa_a16: ; GFX12-TRUE16: ; %bb.0: ; %main_body +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr3 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l ; encoding: [0x01,0x39,0x00,0x7f] -; GFX12-TRUE16-NEXT: image_msaa_load v[0:3], [v0, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16 ; encoding: [0x46,0x20,0x46,0xe4,0x00,0x00,0x00,0x00,0x00,0x02,0x00,0x00] +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l ; encoding: [0x02,0x39,0x06,0x7e] +; GFX12-TRUE16-NEXT: image_msaa_load v[0:3], [v0, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16 ; encoding: [0x46,0x20,0x46,0xe4,0x00,0x00,0x00,0x00,0x00,0x03,0x00,0x00] ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf] ; GFX12-TRUE16-NEXT: ; return to shader part epilog ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll index 437f438efc554..866037e18de1a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll @@ -25,23 +25,45 @@ define amdgpu_ps <4 x float> @sample_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg % ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: sample_1d: -; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: s_mov_b32 s12, exec_lo -; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX11-NEXT: s_and_b32 exec_lo, exec_lo, s12 -; GFX11-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: ; return to shader part epilog -; -; GFX12-LABEL: sample_1d: -; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: s_mov_b32 s12, exec_lo -; GFX12-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 -; GFX12-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: ; return to shader part epilog +; GFX11-TRUE16-LABEL: sample_1d: +; GFX11-TRUE16: ; %bb.0: ; %main_body +; GFX11-TRUE16-NEXT: s_mov_b32 s12, exec_lo +; GFX11-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX11-TRUE16-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX11-TRUE16-NEXT: image_sample v[0:3], v1, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: sample_1d: +; GFX11-FAKE16: ; %bb.0: ; %main_body +; GFX11-FAKE16-NEXT: s_mov_b32 s12, exec_lo +; GFX11-FAKE16-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX11-FAKE16-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX11-FAKE16-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: ; return to shader part epilog +; +; GFX12-TRUE16-LABEL: sample_1d: +; GFX12-TRUE16: ; %bb.0: ; %main_body +; GFX12-TRUE16-NEXT: s_mov_b32 s12, exec_lo +; GFX12-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr1 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX12-TRUE16-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX12-TRUE16-NEXT: image_sample v[0:3], v1, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: ; return to shader part epilog +; +; GFX12-FAKE16-LABEL: sample_1d: +; GFX12-FAKE16: ; %bb.0: ; %main_body +; GFX12-FAKE16-NEXT: s_mov_b32 s12, exec_lo +; GFX12-FAKE16-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX12-FAKE16-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX12-FAKE16-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f16(i32 15, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v @@ -139,9 +161,10 @@ define amdgpu_ps <4 x float> @sample_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg % ; GFX11-TRUE16: ; %bb.0: ; %main_body ; GFX11-TRUE16-NEXT: s_mov_b32 s12, exec_lo ; GFX11-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l ; GFX11-TRUE16-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX11-TRUE16-NEXT: image_sample v[0:3], v[2:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -161,9 +184,11 @@ define amdgpu_ps <4 x float> @sample_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg % ; GFX12-TRUE16: ; %bb.0: ; %main_body ; GFX12-TRUE16-NEXT: s_mov_b32 s12, exec_lo ; GFX12-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l ; GFX12-TRUE16-NEXT: s_and_b32 exec_lo, exec_lo, s12 -; GFX12-TRUE16-NEXT: image_sample v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16 +; GFX12-TRUE16-NEXT: image_sample v[0:3], [v0, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16 ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: ; return to shader part epilog ; @@ -207,9 +232,10 @@ define amdgpu_ps <4 x float> @sample_cube(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX11-TRUE16: ; %bb.0: ; %main_body ; GFX11-TRUE16-NEXT: s_mov_b32 s12, exec_lo ; GFX11-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l ; GFX11-TRUE16-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX11-TRUE16-NEXT: image_sample v[0:3], v[2:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_CUBE a16 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -229,9 +255,11 @@ define amdgpu_ps <4 x float> @sample_cube(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX12-TRUE16: ; %bb.0: ; %main_body ; GFX12-TRUE16-NEXT: s_mov_b32 s12, exec_lo ; GFX12-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l ; GFX12-TRUE16-NEXT: s_and_b32 exec_lo, exec_lo, s12 -; GFX12-TRUE16-NEXT: image_sample v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_CUBE a16 +; GFX12-TRUE16-NEXT: image_sample v[0:3], [v0, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_CUBE a16 ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: ; return to shader part epilog ; @@ -341,9 +369,10 @@ define amdgpu_ps <4 x float> @sample_2darray(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX11-TRUE16: ; %bb.0: ; %main_body ; GFX11-TRUE16-NEXT: s_mov_b32 s12, exec_lo ; GFX11-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l ; GFX11-TRUE16-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX11-TRUE16-NEXT: image_sample v[0:3], v[2:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY a16 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -363,9 +392,11 @@ define amdgpu_ps <4 x float> @sample_2darray(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX12-TRUE16: ; %bb.0: ; %main_body ; GFX12-TRUE16-NEXT: s_mov_b32 s12, exec_lo ; GFX12-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l ; GFX12-TRUE16-NEXT: s_and_b32 exec_lo, exec_lo, s12 -; GFX12-TRUE16-NEXT: image_sample v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY a16 +; GFX12-TRUE16-NEXT: image_sample v[0:3], [v0, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY a16 ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: ; return to shader part epilog ; @@ -402,23 +433,45 @@ define amdgpu_ps <4 x float> @sample_c_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: sample_c_1d: -; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: s_mov_b32 s12, exec_lo -; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX11-NEXT: s_and_b32 exec_lo, exec_lo, s12 -; GFX11-NEXT: image_sample_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: ; return to shader part epilog -; -; GFX12-LABEL: sample_c_1d: -; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: s_mov_b32 s12, exec_lo -; GFX12-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 -; GFX12-NEXT: image_sample_c v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: ; return to shader part epilog +; GFX11-TRUE16-LABEL: sample_c_1d: +; GFX11-TRUE16: ; %bb.0: ; %main_body +; GFX11-TRUE16-NEXT: s_mov_b32 s12, exec_lo +; GFX11-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.l +; GFX11-TRUE16-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX11-TRUE16-NEXT: image_sample_c v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: sample_c_1d: +; GFX11-FAKE16: ; %bb.0: ; %main_body +; GFX11-FAKE16-NEXT: s_mov_b32 s12, exec_lo +; GFX11-FAKE16-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX11-FAKE16-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX11-FAKE16-NEXT: image_sample_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: ; return to shader part epilog +; +; GFX12-TRUE16-LABEL: sample_c_1d: +; GFX12-TRUE16: ; %bb.0: ; %main_body +; GFX12-TRUE16-NEXT: s_mov_b32 s12, exec_lo +; GFX12-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr2 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.l +; GFX12-TRUE16-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX12-TRUE16-NEXT: image_sample_c v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: ; return to shader part epilog +; +; GFX12-FAKE16-LABEL: sample_c_1d: +; GFX12-FAKE16: ; %bb.0: ; %main_body +; GFX12-FAKE16-NEXT: s_mov_b32 s12, exec_lo +; GFX12-FAKE16-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX12-FAKE16-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX12-FAKE16-NEXT: image_sample_c v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.1d.v4f32.f16(i32 15, float %zcompare, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v @@ -582,9 +635,10 @@ define amdgpu_ps <4 x float> @sample_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX11-TRUE16: ; %bb.0: ; %main_body ; GFX11-TRUE16-NEXT: s_mov_b32 s12, exec_lo ; GFX11-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l ; GFX11-TRUE16-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX11-TRUE16-NEXT: image_sample_cl v[0:3], v[2:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -604,9 +658,11 @@ define amdgpu_ps <4 x float> @sample_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX12-TRUE16: ; %bb.0: ; %main_body ; GFX12-TRUE16-NEXT: s_mov_b32 s12, exec_lo ; GFX12-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l ; GFX12-TRUE16-NEXT: s_and_b32 exec_lo, exec_lo, s12 -; GFX12-TRUE16-NEXT: image_sample_cl v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX12-TRUE16-NEXT: image_sample_cl v[0:3], [v0, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: ; return to shader part epilog ; @@ -718,9 +774,11 @@ define amdgpu_ps <4 x float> @sample_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX11-TRUE16: ; %bb.0: ; %main_body ; GFX11-TRUE16-NEXT: s_mov_b32 s12, exec_lo ; GFX11-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v3.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l ; GFX11-TRUE16-NEXT: s_and_b32 exec_lo, exec_lo, s12 -; GFX11-TRUE16-NEXT: image_sample_c_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX11-TRUE16-NEXT: image_sample_c_cl v[0:3], [v0, v1, v4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: ; return to shader part epilog ; @@ -738,9 +796,11 @@ define amdgpu_ps <4 x float> @sample_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX12-TRUE16: ; %bb.0: ; %main_body ; GFX12-TRUE16-NEXT: s_mov_b32 s12, exec_lo ; GFX12-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v3.l ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l ; GFX12-TRUE16-NEXT: s_and_b32 exec_lo, exec_lo, s12 -; GFX12-TRUE16-NEXT: image_sample_c_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX12-TRUE16-NEXT: image_sample_c_cl v[0:3], [v0, v1, v4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: ; return to shader part epilog ; @@ -777,23 +837,49 @@ define amdgpu_ps <4 x float> @sample_b_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: sample_b_1d: -; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: s_mov_b32 s12, exec_lo -; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX11-NEXT: s_and_b32 exec_lo, exec_lo, s12 -; GFX11-NEXT: image_sample_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: ; return to shader part epilog -; -; GFX12-LABEL: sample_b_1d: -; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: s_mov_b32 s12, exec_lo -; GFX12-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 -; GFX12-NEXT: image_sample_b v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: ; return to shader part epilog +; GFX11-TRUE16-LABEL: sample_b_1d: +; GFX11-TRUE16: ; %bb.0: ; %main_body +; GFX11-TRUE16-NEXT: s_mov_b32 s12, exec_lo +; GFX11-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX11-TRUE16-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX11-TRUE16-NEXT: image_sample_b v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: sample_b_1d: +; GFX11-FAKE16: ; %bb.0: ; %main_body +; GFX11-FAKE16-NEXT: s_mov_b32 s12, exec_lo +; GFX11-FAKE16-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX11-FAKE16-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX11-FAKE16-NEXT: image_sample_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: ; return to shader part epilog +; +; GFX12-TRUE16-LABEL: sample_b_1d: +; GFX12-TRUE16: ; %bb.0: ; %main_body +; GFX12-TRUE16-NEXT: s_mov_b32 s12, exec_lo +; GFX12-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr2 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.l +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr1 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX12-TRUE16-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX12-TRUE16-NEXT: image_sample_b v[0:3], [v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: ; return to shader part epilog +; +; GFX12-FAKE16-LABEL: sample_b_1d: +; GFX12-FAKE16: ; %bb.0: ; %main_body +; GFX12-FAKE16-NEXT: s_mov_b32 s12, exec_lo +; GFX12-FAKE16-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX12-FAKE16-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX12-FAKE16-NEXT: image_sample_b v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f16.f16(i32 15, half %bias, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v @@ -825,9 +911,12 @@ define amdgpu_ps <4 x float> @sample_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX11-TRUE16: ; %bb.0: ; %main_body ; GFX11-TRUE16-NEXT: s_mov_b32 s12, exec_lo ; GFX11-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l ; GFX11-TRUE16-NEXT: s_and_b32 exec_lo, exec_lo, s12 -; GFX11-TRUE16-NEXT: image_sample_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX11-TRUE16-NEXT: image_sample_b v[0:3], v[2:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: ; return to shader part epilog ; @@ -846,8 +935,10 @@ define amdgpu_ps <4 x float> @sample_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX12-TRUE16-NEXT: s_mov_b32 s12, exec_lo ; GFX12-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr2 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l ; GFX12-TRUE16-NEXT: s_and_b32 exec_lo, exec_lo, s12 -; GFX12-TRUE16-NEXT: image_sample_b v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX12-TRUE16-NEXT: image_sample_b v[0:3], [v2, v1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: ; return to shader part epilog ; @@ -884,23 +975,49 @@ define amdgpu_ps <4 x float> @sample_c_b_1d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: sample_c_b_1d: -; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: s_mov_b32 s12, exec_lo -; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX11-NEXT: s_and_b32 exec_lo, exec_lo, s12 -; GFX11-NEXT: image_sample_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: ; return to shader part epilog -; -; GFX12-LABEL: sample_c_b_1d: -; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: s_mov_b32 s12, exec_lo -; GFX12-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX12-NEXT: s_and_b32 exec_lo, exec_lo, s12 -; GFX12-NEXT: image_sample_c_b v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: ; return to shader part epilog +; GFX11-TRUE16-LABEL: sample_c_b_1d: +; GFX11-TRUE16: ; %bb.0: ; %main_body +; GFX11-TRUE16-NEXT: s_mov_b32 s12, exec_lo +; GFX11-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l +; GFX11-TRUE16-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX11-TRUE16-NEXT: image_sample_c_b v[0:3], [v2, v1, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: sample_c_b_1d: +; GFX11-FAKE16: ; %bb.0: ; %main_body +; GFX11-FAKE16-NEXT: s_mov_b32 s12, exec_lo +; GFX11-FAKE16-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX11-FAKE16-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX11-FAKE16-NEXT: image_sample_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: ; return to shader part epilog +; +; GFX12-TRUE16-LABEL: sample_c_b_1d: +; GFX12-TRUE16: ; %bb.0: ; %main_body +; GFX12-TRUE16-NEXT: s_mov_b32 s12, exec_lo +; GFX12-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr2 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l +; GFX12-TRUE16-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX12-TRUE16-NEXT: image_sample_c_b v[0:3], [v2, v1, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: ; return to shader part epilog +; +; GFX12-FAKE16-LABEL: sample_c_b_1d: +; GFX12-FAKE16: ; %bb.0: ; %main_body +; GFX12-FAKE16-NEXT: s_mov_b32 s12, exec_lo +; GFX12-FAKE16-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX12-FAKE16-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX12-FAKE16-NEXT: image_sample_c_b v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.1d.v4f32.f16.f16(i32 15, half %bias, float %zcompare, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v @@ -933,8 +1050,10 @@ define amdgpu_ps <4 x float> @sample_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX11-TRUE16-NEXT: s_mov_b32 s12, exec_lo ; GFX11-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v3.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l ; GFX11-TRUE16-NEXT: s_and_b32 exec_lo, exec_lo, s12 -; GFX11-TRUE16-NEXT: image_sample_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX11-TRUE16-NEXT: image_sample_c_b v[0:3], [v3, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: ; return to shader part epilog ; @@ -953,8 +1072,10 @@ define amdgpu_ps <4 x float> @sample_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX12-TRUE16-NEXT: s_mov_b32 s12, exec_lo ; GFX12-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, v3.l +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l ; GFX12-TRUE16-NEXT: s_and_b32 exec_lo, exec_lo, s12 -; GFX12-TRUE16-NEXT: image_sample_c_b v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX12-TRUE16-NEXT: image_sample_c_b v[0:3], [v3, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: ; return to shader part epilog ; @@ -998,9 +1119,12 @@ define amdgpu_ps <4 x float> @sample_b_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX11-TRUE16: ; %bb.0: ; %main_body ; GFX11-TRUE16-NEXT: s_mov_b32 s12, exec_lo ; GFX11-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l ; GFX11-TRUE16-NEXT: s_and_b32 exec_lo, exec_lo, s12 -; GFX11-TRUE16-NEXT: image_sample_b_cl v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX11-TRUE16-NEXT: image_sample_b_cl v[0:3], v[2:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: ; return to shader part epilog ; @@ -1019,8 +1143,10 @@ define amdgpu_ps <4 x float> @sample_b_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX12-TRUE16-NEXT: s_mov_b32 s12, exec_lo ; GFX12-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr2 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l ; GFX12-TRUE16-NEXT: s_and_b32 exec_lo, exec_lo, s12 -; GFX12-TRUE16-NEXT: image_sample_b_cl v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX12-TRUE16-NEXT: image_sample_b_cl v[0:3], [v2, v1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: ; return to shader part epilog ; @@ -1066,9 +1192,11 @@ define amdgpu_ps <4 x float> @sample_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX11-TRUE16: ; %bb.0: ; %main_body ; GFX11-TRUE16-NEXT: s_mov_b32 s12, exec_lo ; GFX11-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v3.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l ; GFX11-TRUE16-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX11-TRUE16-NEXT: image_sample_b_cl v[0:3], v[2:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 @@ -1089,9 +1217,13 @@ define amdgpu_ps <4 x float> @sample_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX12-TRUE16: ; %bb.0: ; %main_body ; GFX12-TRUE16-NEXT: s_mov_b32 s12, exec_lo ; GFX12-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr4 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr2 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v3.l +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l ; GFX12-TRUE16-NEXT: s_and_b32 exec_lo, exec_lo, s12 -; GFX12-TRUE16-NEXT: image_sample_b_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX12-TRUE16-NEXT: image_sample_b_cl v[0:3], [v2, v1, v4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: ; return to shader part epilog ; @@ -1136,8 +1268,10 @@ define amdgpu_ps <4 x float> @sample_c_b_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX11-TRUE16-NEXT: s_mov_b32 s12, exec_lo ; GFX11-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v3.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l ; GFX11-TRUE16-NEXT: s_and_b32 exec_lo, exec_lo, s12 -; GFX11-TRUE16-NEXT: image_sample_c_b_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX11-TRUE16-NEXT: image_sample_c_b_cl v[0:3], [v3, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: ; return to shader part epilog ; @@ -1156,8 +1290,10 @@ define amdgpu_ps <4 x float> @sample_c_b_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX12-TRUE16-NEXT: s_mov_b32 s12, exec_lo ; GFX12-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, v3.l +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l ; GFX12-TRUE16-NEXT: s_and_b32 exec_lo, exec_lo, s12 -; GFX12-TRUE16-NEXT: image_sample_c_b_cl v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX12-TRUE16-NEXT: image_sample_c_b_cl v[0:3], [v3, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: ; return to shader part epilog ; @@ -1204,9 +1340,13 @@ define amdgpu_ps <4 x float> @sample_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX11-TRUE16: ; %bb.0: ; %main_body ; GFX11-TRUE16-NEXT: s_mov_b32 s12, exec_lo ; GFX11-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v3.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l ; GFX11-TRUE16-NEXT: s_and_b32 exec_lo, exec_lo, s12 -; GFX11-TRUE16-NEXT: image_sample_c_b_cl v[0:3], [v0, v1, v2, v4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX11-TRUE16-NEXT: image_sample_c_b_cl v[0:3], [v3, v1, v2, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: ; return to shader part epilog ; @@ -1224,9 +1364,13 @@ define amdgpu_ps <4 x float> @sample_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX12-TRUE16: ; %bb.0: ; %main_body ; GFX12-TRUE16-NEXT: s_mov_b32 s12, exec_lo ; GFX12-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr5 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, v3.l +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.l +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l ; GFX12-TRUE16-NEXT: s_and_b32 exec_lo, exec_lo, s12 -; GFX12-TRUE16-NEXT: image_sample_c_b_cl v[0:3], [v0, v1, v2, v4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX12-TRUE16-NEXT: image_sample_c_b_cl v[0:3], [v3, v1, v2, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: ; return to shader part epilog ; @@ -1257,17 +1401,41 @@ define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: sample_d_1d: -; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: image_sample_d_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: ; return to shader part epilog -; -; GFX12-LABEL: sample_d_1d: -; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: image_sample_d_g16 v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: ; return to shader part epilog +; GFX11-TRUE16-LABEL: sample_d_1d: +; GFX11-TRUE16: ; %bb.0: ; %main_body +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX11-TRUE16-NEXT: image_sample_d_g16 v[0:3], v[1:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: sample_d_1d: +; GFX11-FAKE16: ; %bb.0: ; %main_body +; GFX11-FAKE16-NEXT: image_sample_d_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: ; return to shader part epilog +; +; GFX12-TRUE16-LABEL: sample_d_1d: +; GFX12-TRUE16: ; %bb.0: ; %main_body +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr2 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.l +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr1 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX12-TRUE16-NEXT: image_sample_d_g16 v[0:3], [v1, v2, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: ; return to shader part epilog +; +; GFX12-FAKE16-LABEL: sample_d_1d: +; GFX12-FAKE16: ; %bb.0: ; %main_body +; GFX12-FAKE16-NEXT: image_sample_d_g16 v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f16(i32 15, half %dsdh, half %dsdv, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v @@ -1364,12 +1532,17 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; ; GFX11-TRUE16-LABEL: sample_d_3d: ; GFX11-TRUE16: ; %bb.0: ; %main_body +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l -; GFX11-TRUE16-NEXT: image_sample_d_g16 v[0:3], [v0, v2, v3, v5, v[8:9]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16 +; GFX11-TRUE16-NEXT: image_sample_d_g16 v[0:3], [v0, v4, v3, v6, v[8:9]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: ; return to shader part epilog ; @@ -1384,13 +1557,16 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; ; GFX12-TRUE16-LABEL: sample_d_3d: ; GFX12-TRUE16: ; %bb.0: ; %main_body +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr9 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v4.l +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr4 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.l ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.h, v7.l ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v4.l +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l -; GFX12-TRUE16-NEXT: image_sample_d_g16 v[0:3], [v0, v2, v3, v[7:9]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16 +; GFX12-TRUE16-NEXT: image_sample_d_g16 v[0:3], [v0, v4, v3, v[7:9]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16 ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: ; return to shader part epilog ; @@ -1422,17 +1598,41 @@ define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: sample_c_d_1d: -; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: image_sample_c_d_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: ; return to shader part epilog -; -; GFX12-LABEL: sample_c_d_1d: -; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v2, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: ; return to shader part epilog +; GFX11-TRUE16-LABEL: sample_c_d_1d: +; GFX11-TRUE16: ; %bb.0: ; %main_body +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v3.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.l +; GFX11-TRUE16-NEXT: image_sample_c_d_g16 v[0:3], [v0, v2, v3, v4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: sample_c_d_1d: +; GFX11-FAKE16: ; %bb.0: ; %main_body +; GFX11-FAKE16-NEXT: image_sample_c_d_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: ; return to shader part epilog +; +; GFX12-TRUE16-LABEL: sample_c_d_1d: +; GFX12-TRUE16: ; %bb.0: ; %main_body +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v3.l +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr2 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.l +; GFX12-TRUE16-NEXT: image_sample_c_d_g16 v[0:3], [v0, v2, v3, v4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: ; return to shader part epilog +; +; GFX12-FAKE16-LABEL: sample_c_d_1d: +; GFX12-FAKE16: ; %bb.0: ; %main_body +; GFX12-FAKE16-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v2, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f32.f16(i32 15, float %zcompare, half %dsdh, half %dsdv, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v @@ -1518,8 +1718,13 @@ define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in ; ; GFX11-TRUE16-LABEL: sample_d_cl_1d: ; GFX11-TRUE16: ; %bb.0: ; %main_body -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v3.l -; GFX11-TRUE16-NEXT: image_sample_d_cl_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v3.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l +; GFX11-TRUE16-NEXT: image_sample_d_cl_g16 v[0:3], v[2:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: ; return to shader part epilog ; @@ -1533,7 +1738,11 @@ define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX12-TRUE16-LABEL: sample_d_cl_1d: ; GFX12-TRUE16: ; %bb.0: ; %main_body ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, v3.l -; GFX12-TRUE16-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr1 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX12-TRUE16-NEXT: image_sample_d_cl_g16 v[0:3], [v1, v3, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: ; return to shader part epilog ; @@ -1570,9 +1779,10 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; ; GFX11-TRUE16-LABEL: sample_d_cl_2d: ; GFX11-TRUE16: ; %bb.0: ; %main_body +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v5.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v5.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v3.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l @@ -1592,10 +1802,12 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; ; GFX12-TRUE16-LABEL: sample_d_cl_2d: ; GFX12-TRUE16: ; %bb.0: ; %main_body +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr7 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, v5.l +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, v3.l ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l -; GFX12-TRUE16-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v2, v4, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX12-TRUE16-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v2, v4, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: ; return to shader part epilog ; @@ -1631,7 +1843,11 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX11-TRUE16-LABEL: sample_c_d_cl_1d: ; GFX11-TRUE16: ; %bb.0: ; %main_body ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v4.l -; GFX11-TRUE16-NEXT: image_sample_c_d_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.l +; GFX11-TRUE16-NEXT: image_sample_c_d_cl_g16 v[0:3], [v0, v2, v4, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: ; return to shader part epilog ; @@ -1645,7 +1861,11 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX12-TRUE16-LABEL: sample_c_d_cl_1d: ; GFX12-TRUE16: ; %bb.0: ; %main_body ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v4.l -; GFX12-TRUE16-NEXT: image_sample_c_d_cl_g16 v[0:3], [v0, v1, v2, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr2 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.l +; GFX12-TRUE16-NEXT: image_sample_c_d_cl_g16 v[0:3], [v0, v2, v4, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: ; return to shader part epilog ; @@ -1684,10 +1904,12 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; ; GFX11-TRUE16-LABEL: sample_c_d_cl_2d: ; GFX11-TRUE16: ; %bb.0: ; %main_body +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v7.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l -; GFX11-TRUE16-NEXT: image_sample_c_d_cl_g16 v[0:3], [v0, v1, v3, v5, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX11-TRUE16-NEXT: image_sample_c_d_cl_g16 v[0:3], [v0, v1, v3, v5, v8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: ; return to shader part epilog ; @@ -1702,9 +1924,10 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; ; GFX12-TRUE16-LABEL: sample_c_d_cl_2d: ; GFX12-TRUE16: ; %bb.0: ; %main_body +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr8 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v6.l ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, v7.l ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v6.l ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v4.l ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l ; GFX12-TRUE16-NEXT: image_sample_c_d_cl_g16 v[0:3], [v0, v1, v3, v[7:8]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 @@ -1790,9 +2013,10 @@ define amdgpu_ps <4 x float> @sample_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; ; GFX11-TRUE16-LABEL: sample_l_2d: ; GFX11-TRUE16: ; %bb.0: ; %main_body +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l ; GFX11-TRUE16-NEXT: image_sample_l v[0:3], v[2:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: ; return to shader part epilog @@ -1806,8 +2030,10 @@ define amdgpu_ps <4 x float> @sample_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; ; GFX12-TRUE16-LABEL: sample_l_2d: ; GFX12-TRUE16: ; %bb.0: ; %main_body +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr3 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l -; GFX12-TRUE16-NEXT: image_sample_l v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l +; GFX12-TRUE16-NEXT: image_sample_l v[0:3], [v0, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: ; return to shader part epilog ; @@ -1890,8 +2116,10 @@ define amdgpu_ps <4 x float> @sample_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inr ; ; GFX11-TRUE16-LABEL: sample_c_l_2d: ; GFX11-TRUE16: ; %bb.0: ; %main_body +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l -; GFX11-TRUE16-NEXT: image_sample_c_l v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v3.l +; GFX11-TRUE16-NEXT: image_sample_c_l v[0:3], [v0, v1, v4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: ; return to shader part epilog ; @@ -1904,8 +2132,10 @@ define amdgpu_ps <4 x float> @sample_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inr ; ; GFX12-TRUE16-LABEL: sample_c_l_2d: ; GFX12-TRUE16: ; %bb.0: ; %main_body +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr4 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l -; GFX12-TRUE16-NEXT: image_sample_c_l v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v3.l +; GFX12-TRUE16-NEXT: image_sample_c_l v[0:3], [v0, v1, v4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: ; return to shader part epilog ; @@ -1933,17 +2163,33 @@ define amdgpu_ps <4 x float> @sample_lz_1d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: sample_lz_1d: -; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: image_sample_lz v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: ; return to shader part epilog -; -; GFX12-LABEL: sample_lz_1d: -; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: image_sample_lz v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: ; return to shader part epilog +; GFX11-TRUE16-LABEL: sample_lz_1d: +; GFX11-TRUE16: ; %bb.0: ; %main_body +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX11-TRUE16-NEXT: image_sample_lz v[0:3], v1, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: sample_lz_1d: +; GFX11-FAKE16: ; %bb.0: ; %main_body +; GFX11-FAKE16-NEXT: image_sample_lz v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: ; return to shader part epilog +; +; GFX12-TRUE16-LABEL: sample_lz_1d: +; GFX12-TRUE16: ; %bb.0: ; %main_body +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr1 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX12-TRUE16-NEXT: image_sample_lz v[0:3], v1, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: ; return to shader part epilog +; +; GFX12-FAKE16-LABEL: sample_lz_1d: +; GFX12-FAKE16: ; %bb.0: ; %main_body +; GFX12-FAKE16-NEXT: image_sample_lz v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.lz.1d.v4f32.f16(i32 15, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v @@ -2010,17 +2256,33 @@ define amdgpu_ps <4 x float> @sample_c_lz_1d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: sample_c_lz_1d: -; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: image_sample_c_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: ; return to shader part epilog -; -; GFX12-LABEL: sample_c_lz_1d: -; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: image_sample_c_lz v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: ; return to shader part epilog +; GFX11-TRUE16-LABEL: sample_c_lz_1d: +; GFX11-TRUE16: ; %bb.0: ; %main_body +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.l +; GFX11-TRUE16-NEXT: image_sample_c_lz v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: sample_c_lz_1d: +; GFX11-FAKE16: ; %bb.0: ; %main_body +; GFX11-FAKE16-NEXT: image_sample_c_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: ; return to shader part epilog +; +; GFX12-TRUE16-LABEL: sample_c_lz_1d: +; GFX12-TRUE16: ; %bb.0: ; %main_body +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr2 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.l +; GFX12-TRUE16-NEXT: image_sample_c_lz v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: ; return to shader part epilog +; +; GFX12-FAKE16-LABEL: sample_c_lz_1d: +; GFX12-FAKE16: ; %bb.0: ; %main_body +; GFX12-FAKE16-NEXT: image_sample_c_lz v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.lz.1d.v4f32.f16(i32 15, float %zcompare, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v @@ -2102,9 +2364,10 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32> ; ; GFX11-TRUE16-LABEL: sample_c_d_o_2darray_V1: ; GFX11-TRUE16: ; %bb.0: ; %main_body +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v7.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v7.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v5.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v3.l ; GFX11-TRUE16-NEXT: image_sample_c_d_o_g16 v0, [v0, v1, v2, v4, v[8:9]], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16 @@ -2122,9 +2385,10 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32> ; ; GFX12-TRUE16-LABEL: sample_c_d_o_2darray_V1: ; GFX12-TRUE16: ; %bb.0: ; %main_body +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr9 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.h, v7.l ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.l ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.h, v7.l ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v5.l ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, v3.l @@ -2173,9 +2437,10 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 ; ; GFX11-TRUE16-LABEL: sample_c_d_o_2darray_V2: ; GFX11-TRUE16: ; %bb.0: ; %main_body +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v7.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v7.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v5.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v3.l ; GFX11-TRUE16-NEXT: image_sample_c_d_o_g16 v[0:1], [v0, v1, v2, v4, v[8:9]], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY a16 @@ -2193,9 +2458,10 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 ; ; GFX12-TRUE16-LABEL: sample_c_d_o_2darray_V2: ; GFX12-TRUE16: ; %bb.0: ; %main_body +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr9 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.h, v7.l ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.l ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.h, v7.l ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v5.l ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, v3.l @@ -2266,3 +2532,6 @@ declare <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f32.f16(i32, i attributes #0 = { nounwind } attributes #1 = { nounwind readonly } attributes #2 = { nounwind readnone } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX11: {{.*}} +; GFX12: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll index e7b048dda1c1f..ad31829ab059d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll @@ -12,17 +12,37 @@ define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: sample_d_1d: -; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: image_sample_d_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0xe4,0xf0,0x00,0x00,0x00,0x08] -; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] -; GFX11-NEXT: ; return to shader part epilog -; -; GFX12-LABEL: sample_d_1d: -; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: image_sample_d_g16 v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x40,0xce,0xe7,0x00,0x00,0x00,0x04,0x00,0x01,0x02,0x00] -; GFX12-NEXT: s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf] -; GFX12-NEXT: ; return to shader part epilog +; GFX11-TRUE16-LABEL: sample_d_1d: +; GFX11-TRUE16: ; %bb.0: ; %main_body +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l ; encoding: [0x01,0x39,0x06,0x7e] +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l ; encoding: [0x00,0x39,0x02,0x7e] +; GFX11-TRUE16-NEXT: image_sample_d_g16 v[0:3], [v1, v3, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x01,0x0f,0xe4,0xf0,0x01,0x00,0x00,0x08,0x03,0x02,0x00,0x00] +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: sample_d_1d: +; GFX11-FAKE16: ; %bb.0: ; %main_body +; GFX11-FAKE16-NEXT: image_sample_d_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0xe4,0xf0,0x00,0x00,0x00,0x08] +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX11-FAKE16-NEXT: ; return to shader part epilog +; +; GFX12-TRUE16-LABEL: sample_d_1d: +; GFX12-TRUE16: ; %bb.0: ; %main_body +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l ; encoding: [0x01,0x39,0x06,0x7e] +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr1 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l ; encoding: [0x00,0x39,0x02,0x7e] +; GFX12-TRUE16-NEXT: image_sample_d_g16 v[0:3], [v1, v3, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x40,0xce,0xe7,0x00,0x00,0x00,0x04,0x01,0x03,0x02,0x00] +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf] +; GFX12-TRUE16-NEXT: ; return to shader part epilog +; +; GFX12-FAKE16-LABEL: sample_d_1d: +; GFX12-FAKE16: ; %bb.0: ; %main_body +; GFX12-FAKE16-NEXT: image_sample_d_g16 v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x40,0xce,0xe7,0x00,0x00,0x00,0x04,0x00,0x01,0x02,0x00] +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf] +; GFX12-FAKE16-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v @@ -86,9 +106,13 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; ; GFX11-TRUE16-LABEL: sample_d_3d: ; GFX11-TRUE16: ; %bb.0: ; %main_body +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v4.l ; encoding: [0x04,0x39,0x06,0x7f] +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v5.l ; encoding: [0x05,0x39,0x12,0x7e] +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l ; encoding: [0x02,0x39,0x08,0x7e] ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l ; encoding: [0x01,0x39,0x00,0x7f] -; GFX11-TRUE16-NEXT: image_sample_d_g16 v[0:3], [v0, v2, v3, v5, v[6:8]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D ; encoding: [0x09,0x0f,0xe4,0xf0,0x00,0x00,0x00,0x08,0x02,0x03,0x05,0x06] +; GFX11-TRUE16-NEXT: image_sample_d_g16 v[0:3], [v0, v4, v3, v9, v[6:8]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D ; encoding: [0x09,0x0f,0xe4,0xf0,0x00,0x00,0x00,0x08,0x04,0x03,0x09,0x06] ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] ; GFX11-TRUE16-NEXT: ; return to shader part epilog ; @@ -103,8 +127,10 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX12-TRUE16-LABEL: sample_d_3d: ; GFX12-TRUE16: ; %bb.0: ; %main_body ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v4.l ; encoding: [0x04,0x39,0x06,0x7f] +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr4 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l ; encoding: [0x01,0x39,0x00,0x7f] -; GFX12-TRUE16-NEXT: image_sample_d_g16 v[0:3], [v0, v2, v3, v[5:8]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D ; encoding: [0x02,0x40,0xce,0xe7,0x00,0x00,0x00,0x04,0x00,0x02,0x03,0x05] +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l ; encoding: [0x02,0x39,0x08,0x7e] +; GFX12-TRUE16-NEXT: image_sample_d_g16 v[0:3], [v0, v4, v3, v[5:8]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D ; encoding: [0x02,0x40,0xce,0xe7,0x00,0x00,0x00,0x04,0x00,0x04,0x03,0x05] ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf] ; GFX12-TRUE16-NEXT: ; return to shader part epilog ; @@ -127,17 +153,37 @@ define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: sample_c_d_1d: -; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: image_sample_c_d_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0xe8,0xf0,0x00,0x00,0x00,0x08] -; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] -; GFX11-NEXT: ; return to shader part epilog -; -; GFX12-LABEL: sample_c_d_1d: -; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v2, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x80,0xce,0xe7,0x00,0x00,0x00,0x04,0x00,0x01,0x02,0x03] -; GFX12-NEXT: s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf] -; GFX12-NEXT: ; return to shader part epilog +; GFX11-TRUE16-LABEL: sample_c_d_1d: +; GFX11-TRUE16: ; %bb.0: ; %main_body +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l ; encoding: [0x02,0x39,0x08,0x7e] +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.l ; encoding: [0x01,0x39,0x04,0x7e] +; GFX11-TRUE16-NEXT: image_sample_c_d_g16 v[0:3], [v0, v2, v4, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x01,0x0f,0xe8,0xf0,0x00,0x00,0x00,0x08,0x02,0x04,0x03,0x00] +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: sample_c_d_1d: +; GFX11-FAKE16: ; %bb.0: ; %main_body +; GFX11-FAKE16-NEXT: image_sample_c_d_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0xe8,0xf0,0x00,0x00,0x00,0x08] +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX11-FAKE16-NEXT: ; return to shader part epilog +; +; GFX12-TRUE16-LABEL: sample_c_d_1d: +; GFX12-TRUE16: ; %bb.0: ; %main_body +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l ; encoding: [0x02,0x39,0x08,0x7e] +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr2 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.l ; encoding: [0x01,0x39,0x04,0x7e] +; GFX12-TRUE16-NEXT: image_sample_c_d_g16 v[0:3], [v0, v2, v4, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x80,0xce,0xe7,0x00,0x00,0x00,0x04,0x00,0x02,0x04,0x03] +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf] +; GFX12-TRUE16-NEXT: ; return to shader part epilog +; +; GFX12-FAKE16-LABEL: sample_c_d_1d: +; GFX12-FAKE16: ; %bb.0: ; %main_body +; GFX12-FAKE16-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v2, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x80,0xce,0xe7,0x00,0x00,0x00,0x04,0x00,0x01,0x02,0x03] +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf] +; GFX12-FAKE16-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v @@ -195,17 +241,37 @@ define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: sample_d_cl_1d: -; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: image_sample_d_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0x7c,0xf1,0x00,0x00,0x00,0x08] -; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] -; GFX11-NEXT: ; return to shader part epilog -; -; GFX12-LABEL: sample_d_cl_1d: -; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v1, v2, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0xc0,0xd7,0xe7,0x00,0x00,0x00,0x04,0x00,0x01,0x02,0x03] -; GFX12-NEXT: s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf] -; GFX12-NEXT: ; return to shader part epilog +; GFX11-TRUE16-LABEL: sample_d_cl_1d: +; GFX11-TRUE16: ; %bb.0: ; %main_body +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l ; encoding: [0x01,0x39,0x08,0x7e] +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l ; encoding: [0x00,0x39,0x02,0x7e] +; GFX11-TRUE16-NEXT: image_sample_d_cl_g16 v[0:3], [v1, v4, v2, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x01,0x0f,0x7c,0xf1,0x01,0x00,0x00,0x08,0x04,0x02,0x03,0x00] +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: sample_d_cl_1d: +; GFX11-FAKE16: ; %bb.0: ; %main_body +; GFX11-FAKE16-NEXT: image_sample_d_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0x7c,0xf1,0x00,0x00,0x00,0x08] +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX11-FAKE16-NEXT: ; return to shader part epilog +; +; GFX12-TRUE16-LABEL: sample_d_cl_1d: +; GFX12-TRUE16: ; %bb.0: ; %main_body +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l ; encoding: [0x01,0x39,0x08,0x7e] +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr1 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l ; encoding: [0x00,0x39,0x02,0x7e] +; GFX12-TRUE16-NEXT: image_sample_d_cl_g16 v[0:3], [v1, v4, v2, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0xc0,0xd7,0xe7,0x00,0x00,0x00,0x04,0x01,0x04,0x02,0x03] +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf] +; GFX12-TRUE16-NEXT: ; return to shader part epilog +; +; GFX12-FAKE16-LABEL: sample_d_cl_1d: +; GFX12-FAKE16: ; %bb.0: ; %main_body +; GFX12-FAKE16-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v1, v2, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0xc0,0xd7,0xe7,0x00,0x00,0x00,0x04,0x00,0x01,0x02,0x03] +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf] +; GFX12-FAKE16-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v @@ -263,17 +329,37 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: sample_c_d_cl_1d: -; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: image_sample_c_d_cl_g16 v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0x50,0xf1,0x00,0x00,0x00,0x08] -; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] -; GFX11-NEXT: ; return to shader part epilog -; -; GFX12-LABEL: sample_c_d_cl_1d: -; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: image_sample_c_d_cl_g16 v[0:3], [v0, v1, v2, v[3:4]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x00,0xd5,0xe7,0x00,0x00,0x00,0x04,0x00,0x01,0x02,0x03] -; GFX12-NEXT: s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf] -; GFX12-NEXT: ; return to shader part epilog +; GFX11-TRUE16-LABEL: sample_c_d_cl_1d: +; GFX11-TRUE16: ; %bb.0: ; %main_body +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l ; encoding: [0x02,0x39,0x0a,0x7e] +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.l ; encoding: [0x01,0x39,0x04,0x7e] +; GFX11-TRUE16-NEXT: image_sample_c_d_cl_g16 v[0:3], [v0, v2, v5, v3, v4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x01,0x0f,0x50,0xf1,0x00,0x00,0x00,0x08,0x02,0x05,0x03,0x04] +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: sample_c_d_cl_1d: +; GFX11-FAKE16: ; %bb.0: ; %main_body +; GFX11-FAKE16-NEXT: image_sample_c_d_cl_g16 v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0x50,0xf1,0x00,0x00,0x00,0x08] +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX11-FAKE16-NEXT: ; return to shader part epilog +; +; GFX12-TRUE16-LABEL: sample_c_d_cl_1d: +; GFX12-TRUE16: ; %bb.0: ; %main_body +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr5 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l ; encoding: [0x02,0x39,0x0a,0x7e] +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr2 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.l ; encoding: [0x01,0x39,0x04,0x7e] +; GFX12-TRUE16-NEXT: image_sample_c_d_cl_g16 v[0:3], [v0, v2, v5, v[3:4]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x00,0xd5,0xe7,0x00,0x00,0x00,0x04,0x00,0x02,0x05,0x03] +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf] +; GFX12-TRUE16-NEXT: ; return to shader part epilog +; +; GFX12-FAKE16-LABEL: sample_c_d_cl_1d: +; GFX12-FAKE16: ; %bb.0: ; %main_body +; GFX12-FAKE16-NEXT: image_sample_c_d_cl_g16 v[0:3], [v0, v1, v2, v[3:4]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x00,0xd5,0xe7,0x00,0x00,0x00,0x04,0x00,0x01,0x02,0x03] +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf] +; GFX12-FAKE16-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v @@ -446,3 +532,6 @@ declare <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f16.f32(i32, i attributes #0 = { nounwind } attributes #1 = { nounwind readonly } attributes #2 = { nounwind readnone } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX11: {{.*}} +; GFX12: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll index 45cebaf449d54..428f7d65c12a4 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll @@ -12,17 +12,37 @@ define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: sample_d_1d: -; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: image_sample_d_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: ; return to shader part epilog -; -; GFX12-LABEL: sample_d_1d: -; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: image_sample_d_g16 v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: ; return to shader part epilog +; GFX11-TRUE16-LABEL: sample_d_1d: +; GFX11-TRUE16: ; %bb.0: ; %main_body +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX11-TRUE16-NEXT: image_sample_d_g16 v[0:3], [v1, v3, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: sample_d_1d: +; GFX11-FAKE16: ; %bb.0: ; %main_body +; GFX11-FAKE16-NEXT: image_sample_d_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: ; return to shader part epilog +; +; GFX12-TRUE16-LABEL: sample_d_1d: +; GFX12-TRUE16: ; %bb.0: ; %main_body +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr1 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX12-TRUE16-NEXT: image_sample_d_g16 v[0:3], [v1, v3, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: ; return to shader part epilog +; +; GFX12-FAKE16-LABEL: sample_d_1d: +; GFX12-FAKE16: ; %bb.0: ; %main_body +; GFX12-FAKE16-NEXT: image_sample_d_g16 v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v @@ -86,9 +106,13 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; ; GFX11-TRUE16-LABEL: sample_d_3d: ; GFX11-TRUE16: ; %bb.0: ; %main_body +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v4.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l -; GFX11-TRUE16-NEXT: image_sample_d_g16 v[0:3], [v0, v2, v3, v5, v[6:8]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D +; GFX11-TRUE16-NEXT: image_sample_d_g16 v[0:3], [v0, v4, v3, v9, v[6:8]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: ; return to shader part epilog ; @@ -103,8 +127,10 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX12-TRUE16-LABEL: sample_d_3d: ; GFX12-TRUE16: ; %bb.0: ; %main_body ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v4.l +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr4 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l -; GFX12-TRUE16-NEXT: image_sample_d_g16 v[0:3], [v0, v2, v3, v[5:8]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l +; GFX12-TRUE16-NEXT: image_sample_d_g16 v[0:3], [v0, v4, v3, v[5:8]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: ; return to shader part epilog ; @@ -127,17 +153,37 @@ define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: sample_c_d_1d: -; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: image_sample_c_d_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: ; return to shader part epilog -; -; GFX12-LABEL: sample_c_d_1d: -; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v2, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: ; return to shader part epilog +; GFX11-TRUE16-LABEL: sample_c_d_1d: +; GFX11-TRUE16: ; %bb.0: ; %main_body +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.l +; GFX11-TRUE16-NEXT: image_sample_c_d_g16 v[0:3], [v0, v2, v4, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: sample_c_d_1d: +; GFX11-FAKE16: ; %bb.0: ; %main_body +; GFX11-FAKE16-NEXT: image_sample_c_d_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: ; return to shader part epilog +; +; GFX12-TRUE16-LABEL: sample_c_d_1d: +; GFX12-TRUE16: ; %bb.0: ; %main_body +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr2 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.l +; GFX12-TRUE16-NEXT: image_sample_c_d_g16 v[0:3], [v0, v2, v4, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: ; return to shader part epilog +; +; GFX12-FAKE16-LABEL: sample_c_d_1d: +; GFX12-FAKE16: ; %bb.0: ; %main_body +; GFX12-FAKE16-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v2, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v @@ -195,17 +241,37 @@ define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: sample_d_cl_1d: -; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: image_sample_d_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: ; return to shader part epilog -; -; GFX12-LABEL: sample_d_cl_1d: -; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v1, v2, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: ; return to shader part epilog +; GFX11-TRUE16-LABEL: sample_d_cl_1d: +; GFX11-TRUE16: ; %bb.0: ; %main_body +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX11-TRUE16-NEXT: image_sample_d_cl_g16 v[0:3], [v1, v4, v2, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: sample_d_cl_1d: +; GFX11-FAKE16: ; %bb.0: ; %main_body +; GFX11-FAKE16-NEXT: image_sample_d_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: ; return to shader part epilog +; +; GFX12-TRUE16-LABEL: sample_d_cl_1d: +; GFX12-TRUE16: ; %bb.0: ; %main_body +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr1 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX12-TRUE16-NEXT: image_sample_d_cl_g16 v[0:3], [v1, v4, v2, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: ; return to shader part epilog +; +; GFX12-FAKE16-LABEL: sample_d_cl_1d: +; GFX12-FAKE16: ; %bb.0: ; %main_body +; GFX12-FAKE16-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v1, v2, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v @@ -263,17 +329,37 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: sample_c_d_cl_1d: -; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: image_sample_c_d_cl_g16 v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: ; return to shader part epilog -; -; GFX12-LABEL: sample_c_d_cl_1d: -; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: image_sample_c_d_cl_g16 v[0:3], [v0, v1, v2, v[3:4]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: ; return to shader part epilog +; GFX11-TRUE16-LABEL: sample_c_d_cl_1d: +; GFX11-TRUE16: ; %bb.0: ; %main_body +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.l +; GFX11-TRUE16-NEXT: image_sample_c_d_cl_g16 v[0:3], [v0, v2, v5, v3, v4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: sample_c_d_cl_1d: +; GFX11-FAKE16: ; %bb.0: ; %main_body +; GFX11-FAKE16-NEXT: image_sample_c_d_cl_g16 v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: ; return to shader part epilog +; +; GFX12-TRUE16-LABEL: sample_c_d_cl_1d: +; GFX12-TRUE16: ; %bb.0: ; %main_body +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr5 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr2 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.l +; GFX12-TRUE16-NEXT: image_sample_c_d_cl_g16 v[0:3], [v0, v2, v5, v[3:4]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: ; return to shader part epilog +; +; GFX12-FAKE16-LABEL: sample_c_d_cl_1d: +; GFX12-FAKE16: ; %bb.0: ; %main_body +; GFX12-FAKE16-NEXT: image_sample_c_d_cl_g16 v[0:3], [v0, v1, v2, v[3:4]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret <4 x float> %v @@ -444,3 +530,6 @@ declare <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f16.f32(i32, i attributes #0 = { nounwind } attributes #1 = { nounwind readonly } attributes #2 = { nounwind readnone } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX11: {{.*}} +; GFX12: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.noret.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.noret.ll index 4873b42a235e3..2d9189d570937 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.noret.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.noret.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10PLUS-SDAG,GFX10,GFX10-SDAG %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10PLUS-GISEL,GFX10,GFX10-GISEL %s -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10PLUS-SDAG,GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10PLUS-SDAG,GFX11,GFX11-SDAG,GFX11-SDAG-FAKE16 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10,GFX10-SDAG %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10,GFX10-GISEL %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-SDAG,GFX11-SDAG-FAKE16 %s ; FIXME-TRUE16. enable gisel -; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10PLUS-GISEL,GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10PLUS-GISEL,GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s +; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-TRUE16 %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-FAKE16 %s ; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s @@ -296,29 +296,53 @@ main_body: } define amdgpu_ps <4 x float> @sample_nortn_mix_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) { -; GFX10PLUS-SDAG-LABEL: sample_nortn_mix_3: -; GFX10PLUS-SDAG: ; %bb.0: ; %main_body -; GFX10PLUS-SDAG-NEXT: s_mov_b32 s12, exec_lo -; GFX10PLUS-SDAG-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10PLUS-SDAG-NEXT: image_sample v1, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D -; GFX10PLUS-SDAG-NEXT: s_and_b32 exec_lo, exec_lo, s12 -; GFX10PLUS-SDAG-NEXT: image_sample off, v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX10PLUS-SDAG-NEXT: s_waitcnt vmcnt(1) -; GFX10PLUS-SDAG-NEXT: image_sample v[0:3], v1, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX10PLUS-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX10PLUS-SDAG-NEXT: ; return to shader part epilog -; -; GFX10PLUS-GISEL-LABEL: sample_nortn_mix_3: -; GFX10PLUS-GISEL: ; %bb.0: ; %main_body -; GFX10PLUS-GISEL-NEXT: s_mov_b32 s12, exec_lo -; GFX10PLUS-GISEL-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10PLUS-GISEL-NEXT: image_sample v[1:4], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX10PLUS-GISEL-NEXT: s_and_b32 exec_lo, exec_lo, s12 -; GFX10PLUS-GISEL-NEXT: image_sample off, v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX10PLUS-GISEL-NEXT: s_waitcnt vmcnt(1) -; GFX10PLUS-GISEL-NEXT: image_sample v[0:3], v1, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX10PLUS-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10PLUS-GISEL-NEXT: ; return to shader part epilog +; GFX10-SDAG-LABEL: sample_nortn_mix_3: +; GFX10-SDAG: ; %bb.0: ; %main_body +; GFX10-SDAG-NEXT: s_mov_b32 s12, exec_lo +; GFX10-SDAG-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-SDAG-NEXT: image_sample v1, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX10-SDAG-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-SDAG-NEXT: image_sample off, v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(1) +; GFX10-SDAG-NEXT: image_sample v[0:3], v1, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: ; return to shader part epilog +; +; GFX10-GISEL-LABEL: sample_nortn_mix_3: +; GFX10-GISEL: ; %bb.0: ; %main_body +; GFX10-GISEL-NEXT: s_mov_b32 s12, exec_lo +; GFX10-GISEL-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-GISEL-NEXT: image_sample v[1:4], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-GISEL-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-GISEL-NEXT: image_sample off, v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(1) +; GFX10-GISEL-NEXT: image_sample v[0:3], v1, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10-GISEL-NEXT: ; return to shader part epilog +; +; GFX11-SDAG-LABEL: sample_nortn_mix_3: +; GFX11-SDAG: ; %bb.0: ; %main_body +; GFX11-SDAG-NEXT: s_mov_b32 s12, exec_lo +; GFX11-SDAG-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX11-SDAG-NEXT: image_sample v1, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX11-SDAG-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX11-SDAG-NEXT: image_sample off, v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(1) +; GFX11-SDAG-NEXT: image_sample v[0:3], v1, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: ; return to shader part epilog +; +; GFX11-GISEL-LABEL: sample_nortn_mix_3: +; GFX11-GISEL: ; %bb.0: ; %main_body +; GFX11-GISEL-NEXT: s_mov_b32 s12, exec_lo +; GFX11-GISEL-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX11-GISEL-NEXT: image_sample v[1:4], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX11-GISEL-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX11-GISEL-NEXT: image_sample off, v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(1) +; GFX11-GISEL-NEXT: image_sample v[0:3], v1, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: ; return to shader part epilog ; ; GFX12-SDAG-LABEL: sample_nortn_mix_3: ; GFX12-SDAG: ; %bb.0: ; %main_body @@ -352,39 +376,73 @@ main_body: } define amdgpu_ps <4 x float> @sample_nortn_mix_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) { -; GFX10PLUS-SDAG-LABEL: sample_nortn_mix_4: -; GFX10PLUS-SDAG: ; %bb.0: ; %main_body -; GFX10PLUS-SDAG-NEXT: s_mov_b32 s12, exec_lo -; GFX10PLUS-SDAG-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10PLUS-SDAG-NEXT: image_sample v4, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D -; GFX10PLUS-SDAG-NEXT: image_sample off, v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX10PLUS-SDAG-NEXT: s_waitcnt vmcnt(1) -; GFX10PLUS-SDAG-NEXT: image_sample off, v4, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX10PLUS-SDAG-NEXT: image_sample v[0:3], v4, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX10PLUS-SDAG-NEXT: s_and_b32 exec_lo, exec_lo, s12 -; GFX10PLUS-SDAG-NEXT: image_sample off, v4, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX10PLUS-SDAG-NEXT: image_sample off, v4, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX10PLUS-SDAG-NEXT: s_waitcnt vmcnt(2) -; GFX10PLUS-SDAG-NEXT: image_sample off, v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX10PLUS-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX10PLUS-SDAG-NEXT: ; return to shader part epilog -; -; GFX10PLUS-GISEL-LABEL: sample_nortn_mix_4: -; GFX10PLUS-GISEL: ; %bb.0: ; %main_body -; GFX10PLUS-GISEL-NEXT: s_mov_b32 s12, exec_lo -; GFX10PLUS-GISEL-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10PLUS-GISEL-NEXT: image_sample v[4:7], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX10PLUS-GISEL-NEXT: image_sample off, v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX10PLUS-GISEL-NEXT: s_waitcnt vmcnt(1) -; GFX10PLUS-GISEL-NEXT: image_sample off, v4, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX10PLUS-GISEL-NEXT: image_sample v[0:3], v4, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX10PLUS-GISEL-NEXT: s_and_b32 exec_lo, exec_lo, s12 -; GFX10PLUS-GISEL-NEXT: image_sample off, v4, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX10PLUS-GISEL-NEXT: image_sample off, v4, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX10PLUS-GISEL-NEXT: s_waitcnt vmcnt(2) -; GFX10PLUS-GISEL-NEXT: image_sample off, v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX10PLUS-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10PLUS-GISEL-NEXT: ; return to shader part epilog +; GFX10-SDAG-LABEL: sample_nortn_mix_4: +; GFX10-SDAG: ; %bb.0: ; %main_body +; GFX10-SDAG-NEXT: s_mov_b32 s12, exec_lo +; GFX10-SDAG-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-SDAG-NEXT: image_sample v4, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX10-SDAG-NEXT: image_sample off, v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(1) +; GFX10-SDAG-NEXT: image_sample off, v4, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-SDAG-NEXT: image_sample v[0:3], v4, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-SDAG-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-SDAG-NEXT: image_sample off, v4, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-SDAG-NEXT: image_sample off, v4, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(2) +; GFX10-SDAG-NEXT: image_sample off, v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: ; return to shader part epilog +; +; GFX10-GISEL-LABEL: sample_nortn_mix_4: +; GFX10-GISEL: ; %bb.0: ; %main_body +; GFX10-GISEL-NEXT: s_mov_b32 s12, exec_lo +; GFX10-GISEL-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-GISEL-NEXT: image_sample v[4:7], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-GISEL-NEXT: image_sample off, v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(1) +; GFX10-GISEL-NEXT: image_sample off, v4, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-GISEL-NEXT: image_sample v[0:3], v4, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-GISEL-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-GISEL-NEXT: image_sample off, v4, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-GISEL-NEXT: image_sample off, v4, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(2) +; GFX10-GISEL-NEXT: image_sample off, v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10-GISEL-NEXT: ; return to shader part epilog +; +; GFX11-SDAG-LABEL: sample_nortn_mix_4: +; GFX11-SDAG: ; %bb.0: ; %main_body +; GFX11-SDAG-NEXT: s_mov_b32 s12, exec_lo +; GFX11-SDAG-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX11-SDAG-NEXT: image_sample v4, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX11-SDAG-NEXT: image_sample off, v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(1) +; GFX11-SDAG-NEXT: image_sample off, v4, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX11-SDAG-NEXT: image_sample v[0:3], v4, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX11-SDAG-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX11-SDAG-NEXT: image_sample off, v4, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX11-SDAG-NEXT: image_sample off, v4, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(2) +; GFX11-SDAG-NEXT: image_sample off, v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: ; return to shader part epilog +; +; GFX11-GISEL-LABEL: sample_nortn_mix_4: +; GFX11-GISEL: ; %bb.0: ; %main_body +; GFX11-GISEL-NEXT: s_mov_b32 s12, exec_lo +; GFX11-GISEL-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX11-GISEL-NEXT: image_sample v[4:7], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX11-GISEL-NEXT: image_sample off, v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(1) +; GFX11-GISEL-NEXT: image_sample off, v4, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX11-GISEL-NEXT: image_sample v[0:3], v4, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX11-GISEL-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX11-GISEL-NEXT: image_sample off, v4, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX11-GISEL-NEXT: image_sample off, v4, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(2) +; GFX11-GISEL-NEXT: image_sample off, v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: ; return to shader part epilog ; ; GFX12-SDAG-LABEL: sample_nortn_mix_4: ; GFX12-SDAG: ; %bb.0: ; %main_body @@ -436,15 +494,48 @@ main_body: } define amdgpu_ps void @sample_d_1d_g16_nortn(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) { -; GFX10PLUS-LABEL: sample_d_1d_g16_nortn: -; GFX10PLUS: ; %bb.0: ; %main_body -; GFX10PLUS-NEXT: image_sample_d_g16 off, v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX10PLUS-NEXT: s_endpgm +; GFX10-LABEL: sample_d_1d_g16_nortn: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: image_sample_d_g16 off, v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-NEXT: s_endpgm ; -; GFX12-LABEL: sample_d_1d_g16_nortn: -; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: image_sample_d_g16 off, [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX12-NEXT: s_endpgm +; GFX11-SDAG-TRUE16-LABEL: sample_d_1d_g16_nortn: +; GFX11-SDAG-TRUE16: ; %bb.0: ; %main_body +; GFX11-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr3 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l +; GFX11-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr1 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX11-SDAG-TRUE16-NEXT: image_sample_d_g16 off, [v1, v3, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX11-SDAG-TRUE16-NEXT: s_endpgm +; +; GFX11-SDAG-FAKE16-LABEL: sample_d_1d_g16_nortn: +; GFX11-SDAG-FAKE16: ; %bb.0: ; %main_body +; GFX11-SDAG-FAKE16-NEXT: image_sample_d_g16 off, v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX11-SDAG-FAKE16-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: sample_d_1d_g16_nortn: +; GFX11-GISEL: ; %bb.0: ; %main_body +; GFX11-GISEL-NEXT: image_sample_d_g16 off, v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-TRUE16-LABEL: sample_d_1d_g16_nortn: +; GFX12-SDAG-TRUE16: ; %bb.0: ; %main_body +; GFX12-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr3 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l +; GFX12-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr1 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX12-SDAG-TRUE16-NEXT: image_sample_d_g16 off, [v1, v3, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX12-SDAG-TRUE16-NEXT: s_endpgm +; +; GFX12-SDAG-FAKE16-LABEL: sample_d_1d_g16_nortn: +; GFX12-SDAG-FAKE16: ; %bb.0: ; %main_body +; GFX12-SDAG-FAKE16-NEXT: image_sample_d_g16 off, [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX12-SDAG-FAKE16-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: sample_d_1d_g16_nortn: +; GFX12-GISEL: ; %bb.0: ; %main_body +; GFX12-GISEL-NEXT: image_sample_d_g16 off, [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX12-GISEL-NEXT: s_endpgm main_body: call void @llvm.amdgcn.image.sample.d.1d.nortn.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) ret void @@ -476,15 +567,6 @@ declare void @llvm.amdgcn.image.sample.d.1d.nortn.f16.f32(i32, half, half, float attributes #0 = { nounwind } attributes #1 = { nounwind readonly } ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GFX10: {{.*}} -; GFX10-GISEL: {{.*}} -; GFX10-SDAG: {{.*}} ; GFX11: {{.*}} -; GFX11-GISEL: {{.*}} ; GFX11-GISEL-FAKE16: {{.*}} -; GFX11-SDAG: {{.*}} -; GFX11-SDAG-FAKE16: {{.*}} -; GFX11-SDAG-TRUE16: {{.*}} ; GFX12-GISEL-FAKE16: {{.*}} -; GFX12-SDAG-FAKE16: {{.*}} -; GFX12-SDAG-TRUE16: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll index e7d8683137dd5..d69406e01da56 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll @@ -1,186 +1,754 @@ -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10PLUS %s -; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10PLUS %s -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefix=GFX10PLUS %s -; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefix=GFX10PLUS %s -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefix=GFX10PLUS %s -; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefix=GFX10PLUS %s - -; GFX10PLUS-LABEL: {{^}}dpp8_test: -; GFX10PLUS: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}} -; GFX10PLUS: v_mov_b32_dpp [[SRC]], [[SRC]] dpp8:[1,0,0,0,0,0,0,0]{{$}} +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10,GFX10-SDAG %s +; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10,GFX10-GISEL %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-SDAG %s +; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-GISEL %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX12,GFX12-SDAG %s +; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX12,GFX12-GISEL %s + define amdgpu_kernel void @dpp8_test(ptr addrspace(1) %out, i32 %in) { +; GFX10-LABEL: dpp8_test: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: dpp8_test: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: dpp8_test: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_endpgm %tmp0 = call i32 @llvm.amdgcn.mov.dpp8.i32(i32 %in, i32 1) #0 store i32 %tmp0, ptr addrspace(1) %out ret void } -; GFX10PLUS-LABEL: {{^}}dpp8_wait_states: -; GFX10PLUS-NOOPT: v_mov_b32_e32 [[VGPR1:v[0-9]+]], s{{[0-9]+}} -; GFX10PLUS: v_mov_b32_e32 [[VGPR0:v[0-9]+]], s{{[0-9]+}} -; GFX10PLUS: v_mov_b32_dpp [[VGPR0]], [[VGPR0]] dpp8:[1,0,0,0,0,0,0,0]{{$}} -; GFX10PLUS: v_mov_b32_dpp [[VGPR0]], [[VGPR0]] dpp8:[5,0,0,0,0,0,0,0]{{$}} define amdgpu_kernel void @dpp8_wait_states(ptr addrspace(1) %out, i32 %in) { +; GFX10-LABEL: dpp8_wait_states: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX10-NEXT: v_mov_b32_dpp v0, v0 dpp8:[5,0,0,0,0,0,0,0] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: dpp8_wait_states: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX11-NEXT: v_mov_b32_dpp v0, v0 dpp8:[5,0,0,0,0,0,0,0] +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: dpp8_wait_states: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX12-NEXT: v_mov_b32_dpp v0, v0 dpp8:[5,0,0,0,0,0,0,0] +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_endpgm %tmp0 = call i32 @llvm.amdgcn.mov.dpp8.i32(i32 %in, i32 1) #0 %tmp1 = call i32 @llvm.amdgcn.mov.dpp8.i32(i32 %tmp0, i32 5) #0 store i32 %tmp1, ptr addrspace(1) %out ret void } -; GFX10PLUS-LABEL: {{^}}dpp8_i64: -; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] -; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] -; GFX10PLUS-DAG: global_store_{{dwordx2|b64}} v[2:3], v[0:1], off define amdgpu_ps void @dpp8_i64(i64 %in, ptr addrspace(1) %out) { +; GFX10-SDAG-LABEL: dpp8_i64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX10-SDAG-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX10-SDAG-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: dpp8_i64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX10-GISEL-NEXT: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX10-GISEL-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: dpp8_i64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX11-SDAG-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX11-SDAG-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: dpp8_i64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX11-GISEL-NEXT: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX11-GISEL-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: dpp8_i64: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX12-SDAG-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX12-SDAG-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: dpp8_i64: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX12-GISEL-NEXT: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX12-GISEL-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX12-GISEL-NEXT: s_endpgm %tmp0 = call i64 @llvm.amdgcn.mov.dpp8.i64(i64 %in, i32 1) store i64 %tmp0, ptr addrspace(1) %out ret void } -; GFX10PLUS-LABEL: {{^}}dpp8_v2i32: -; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] -; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] -; GFX10PLUS-DAG: global_store_{{dwordx2|b64}} v[2:3], v[0:1], off define amdgpu_ps void @dpp8_v2i32(<2 x i32> %in, ptr addrspace(1) %out) { +; GFX10-SDAG-LABEL: dpp8_v2i32: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX10-SDAG-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX10-SDAG-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: dpp8_v2i32: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX10-GISEL-NEXT: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX10-GISEL-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: dpp8_v2i32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX11-SDAG-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX11-SDAG-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: dpp8_v2i32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX11-GISEL-NEXT: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX11-GISEL-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: dpp8_v2i32: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX12-SDAG-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX12-SDAG-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: dpp8_v2i32: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX12-GISEL-NEXT: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX12-GISEL-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX12-GISEL-NEXT: s_endpgm %tmp0 = call <2 x i32> @llvm.amdgcn.mov.dpp8.v3i32(<2 x i32> %in, i32 1) store <2 x i32> %tmp0, ptr addrspace(1) %out ret void } -; GFX10PLUS-LABEL: {{^}}dpp8_v3i32: -; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] -; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] -; GFX10PLUS-DAG: v_mov_b32_dpp v2, v2 dpp8:[1,0,0,0,0,0,0,0] -; GFX10PLUS-DAG: global_store_{{dwordx3|b96}} v[3:4], v[0:2], off define amdgpu_ps void @dpp8_v3i32(<3 x i32> %in, ptr addrspace(1) %out) { +; GFX10-SDAG-LABEL: dpp8_v3i32: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: v_mov_b32_dpp v2, v2 dpp8:[1,0,0,0,0,0,0,0] +; GFX10-SDAG-NEXT: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX10-SDAG-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX10-SDAG-NEXT: global_store_dwordx3 v[3:4], v[0:2], off +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: dpp8_v3i32: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX10-GISEL-NEXT: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX10-GISEL-NEXT: v_mov_b32_dpp v2, v2 dpp8:[1,0,0,0,0,0,0,0] +; GFX10-GISEL-NEXT: global_store_dwordx3 v[3:4], v[0:2], off +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: dpp8_v3i32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: v_mov_b32_dpp v2, v2 dpp8:[1,0,0,0,0,0,0,0] +; GFX11-SDAG-NEXT: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX11-SDAG-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX11-SDAG-NEXT: global_store_b96 v[3:4], v[0:2], off +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: dpp8_v3i32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX11-GISEL-NEXT: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX11-GISEL-NEXT: v_mov_b32_dpp v2, v2 dpp8:[1,0,0,0,0,0,0,0] +; GFX11-GISEL-NEXT: global_store_b96 v[3:4], v[0:2], off +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: dpp8_v3i32: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: v_mov_b32_dpp v2, v2 dpp8:[1,0,0,0,0,0,0,0] +; GFX12-SDAG-NEXT: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX12-SDAG-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX12-SDAG-NEXT: global_store_b96 v[3:4], v[0:2], off +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: dpp8_v3i32: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX12-GISEL-NEXT: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX12-GISEL-NEXT: v_mov_b32_dpp v2, v2 dpp8:[1,0,0,0,0,0,0,0] +; GFX12-GISEL-NEXT: global_store_b96 v[3:4], v[0:2], off +; GFX12-GISEL-NEXT: s_endpgm %tmp0 = call <3 x i32> @llvm.amdgcn.mov.dpp8.v3i32(<3 x i32> %in, i32 1) store <3 x i32> %tmp0, ptr addrspace(1) %out ret void } -; GFX10PLUS-LABEL: {{^}}dpp8_v4i32: -; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] -; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] -; GFX10PLUS-DAG: v_mov_b32_dpp v2, v2 dpp8:[1,0,0,0,0,0,0,0] -; GFX10PLUS-DAG: v_mov_b32_dpp v3, v3 dpp8:[1,0,0,0,0,0,0,0] -; GFX10PLUS-DAG: global_store_{{dwordx4|b128}} v[4:5], v[0:3], off define amdgpu_ps void @dpp8_v4i32(<4 x i32> %in, ptr addrspace(1) %out) { +; GFX10-SDAG-LABEL: dpp8_v4i32: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: v_mov_b32_dpp v3, v3 dpp8:[1,0,0,0,0,0,0,0] +; GFX10-SDAG-NEXT: v_mov_b32_dpp v2, v2 dpp8:[1,0,0,0,0,0,0,0] +; GFX10-SDAG-NEXT: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX10-SDAG-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX10-SDAG-NEXT: global_store_dwordx4 v[4:5], v[0:3], off +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: dpp8_v4i32: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX10-GISEL-NEXT: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX10-GISEL-NEXT: v_mov_b32_dpp v2, v2 dpp8:[1,0,0,0,0,0,0,0] +; GFX10-GISEL-NEXT: v_mov_b32_dpp v3, v3 dpp8:[1,0,0,0,0,0,0,0] +; GFX10-GISEL-NEXT: global_store_dwordx4 v[4:5], v[0:3], off +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: dpp8_v4i32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: v_mov_b32_dpp v3, v3 dpp8:[1,0,0,0,0,0,0,0] +; GFX11-SDAG-NEXT: v_mov_b32_dpp v2, v2 dpp8:[1,0,0,0,0,0,0,0] +; GFX11-SDAG-NEXT: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX11-SDAG-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX11-SDAG-NEXT: global_store_b128 v[4:5], v[0:3], off +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: dpp8_v4i32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX11-GISEL-NEXT: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX11-GISEL-NEXT: v_mov_b32_dpp v2, v2 dpp8:[1,0,0,0,0,0,0,0] +; GFX11-GISEL-NEXT: v_mov_b32_dpp v3, v3 dpp8:[1,0,0,0,0,0,0,0] +; GFX11-GISEL-NEXT: global_store_b128 v[4:5], v[0:3], off +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: dpp8_v4i32: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: v_mov_b32_dpp v3, v3 dpp8:[1,0,0,0,0,0,0,0] +; GFX12-SDAG-NEXT: v_mov_b32_dpp v2, v2 dpp8:[1,0,0,0,0,0,0,0] +; GFX12-SDAG-NEXT: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX12-SDAG-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX12-SDAG-NEXT: global_store_b128 v[4:5], v[0:3], off +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: dpp8_v4i32: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX12-GISEL-NEXT: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX12-GISEL-NEXT: v_mov_b32_dpp v2, v2 dpp8:[1,0,0,0,0,0,0,0] +; GFX12-GISEL-NEXT: v_mov_b32_dpp v3, v3 dpp8:[1,0,0,0,0,0,0,0] +; GFX12-GISEL-NEXT: global_store_b128 v[4:5], v[0:3], off +; GFX12-GISEL-NEXT: s_endpgm %tmp0 = call <4 x i32> @llvm.amdgcn.mov.dpp8.v3i32(<4 x i32> %in, i32 1) store <4 x i32> %tmp0, ptr addrspace(1) %out ret void } -; GFX10PLUS-LABEL: {{^}}dpp8_p0: -; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] -; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] -; GFX10PLUS-DAG: global_store_{{dwordx2|b64}} v[2:3], v[0:1], off define amdgpu_ps void @dpp8_p0(ptr %in, ptr addrspace(1) %out) { +; GFX10-SDAG-LABEL: dpp8_p0: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX10-SDAG-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX10-SDAG-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: dpp8_p0: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX10-GISEL-NEXT: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX10-GISEL-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: dpp8_p0: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX11-SDAG-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX11-SDAG-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: dpp8_p0: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX11-GISEL-NEXT: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX11-GISEL-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: dpp8_p0: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX12-SDAG-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX12-SDAG-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: dpp8_p0: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX12-GISEL-NEXT: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX12-GISEL-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX12-GISEL-NEXT: s_endpgm %tmp0 = call ptr @llvm.amdgcn.mov.dpp8.p0(ptr %in, i32 1) store ptr %tmp0, ptr addrspace(1) %out ret void } -; GFX10PLUS-LABEL: {{^}}dpp8_p3: -; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] -; GFX10PLUS-DAG: global_store_{{dword|b32}} v[1:2], v0, off define amdgpu_ps void @dpp8_p3(ptr addrspace(3) %in, ptr addrspace(1) %out) { +; GFX10-LABEL: dpp8_p3: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX10-NEXT: global_store_dword v[1:2], v0, off +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: dpp8_p3: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX11-NEXT: global_store_b32 v[1:2], v0, off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: dpp8_p3: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX12-NEXT: global_store_b32 v[1:2], v0, off +; GFX12-NEXT: s_endpgm %tmp0 = call ptr addrspace(3) @llvm.amdgcn.mov.dpp8.v3p3(ptr addrspace(3) %in, i32 1) store ptr addrspace(3) %tmp0, ptr addrspace(1) %out ret void } -; GFX10PLUS-LABEL: {{^}}dpp8_v3p3: -; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] -; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] -; GFX10PLUS-DAG: v_mov_b32_dpp v2, v2 dpp8:[1,0,0,0,0,0,0,0] -; GFX10PLUS-DAG: global_store_{{dwordx3|b96}} v[3:4], v[0:2], off define amdgpu_ps void @dpp8_v3p3(<3 x ptr addrspace(3)> %in, ptr addrspace(1) %out) { +; GFX10-SDAG-LABEL: dpp8_v3p3: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: v_mov_b32_dpp v2, v2 dpp8:[1,0,0,0,0,0,0,0] +; GFX10-SDAG-NEXT: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX10-SDAG-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX10-SDAG-NEXT: global_store_dwordx3 v[3:4], v[0:2], off +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: dpp8_v3p3: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX10-GISEL-NEXT: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX10-GISEL-NEXT: v_mov_b32_dpp v2, v2 dpp8:[1,0,0,0,0,0,0,0] +; GFX10-GISEL-NEXT: global_store_dwordx3 v[3:4], v[0:2], off +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: dpp8_v3p3: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: v_mov_b32_dpp v2, v2 dpp8:[1,0,0,0,0,0,0,0] +; GFX11-SDAG-NEXT: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX11-SDAG-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX11-SDAG-NEXT: global_store_b96 v[3:4], v[0:2], off +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: dpp8_v3p3: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX11-GISEL-NEXT: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX11-GISEL-NEXT: v_mov_b32_dpp v2, v2 dpp8:[1,0,0,0,0,0,0,0] +; GFX11-GISEL-NEXT: global_store_b96 v[3:4], v[0:2], off +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: dpp8_v3p3: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: v_mov_b32_dpp v2, v2 dpp8:[1,0,0,0,0,0,0,0] +; GFX12-SDAG-NEXT: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX12-SDAG-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX12-SDAG-NEXT: global_store_b96 v[3:4], v[0:2], off +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: dpp8_v3p3: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX12-GISEL-NEXT: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX12-GISEL-NEXT: v_mov_b32_dpp v2, v2 dpp8:[1,0,0,0,0,0,0,0] +; GFX12-GISEL-NEXT: global_store_b96 v[3:4], v[0:2], off +; GFX12-GISEL-NEXT: s_endpgm %tmp0 = call <3 x ptr addrspace(3)> @llvm.amdgcn.mov.dpp8.v3p3(<3 x ptr addrspace(3)> %in, i32 1) store <3 x ptr addrspace(3)> %tmp0, ptr addrspace(1) %out ret void } -; GFX10PLUS-LABEL: {{^}}dpp8_i16: -; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] -; GFX10PLUS-DAG: global_store_{{short|b16}} v[1:2], v0, off define amdgpu_ps void @dpp8_i16(i16 %in, ptr addrspace(1) %out) { +; GFX10-LABEL: dpp8_i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX10-NEXT: global_store_short v[1:2], v0, off +; GFX10-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: dpp8_i16: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: ; implicit-def: $vgpr3 +; GFX11-SDAG-NEXT: v_mov_b16_e32 v3.l, v0.l +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_mov_b32_dpp v3, v3 dpp8:[1,0,0,0,0,0,0,0] +; GFX11-SDAG-NEXT: global_store_b16 v[1:2], v3, off +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: dpp8_i16: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX11-GISEL-NEXT: global_store_b16 v[1:2], v0, off +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-LABEL: dpp8_i16: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX12-NEXT: global_store_b16 v[1:2], v0, off +; GFX12-NEXT: s_endpgm %tmp0 = call i16 @llvm.amdgcn.mov.dpp8.i16(i16 %in, i32 1) store i16 %tmp0, ptr addrspace(1) %out ret void } -; GFX10PLUS-LABEL: {{^}}dpp8_v4i16: -; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] -; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] -; GFX10PLUS-DAG: global_store_{{dwordx2|b64}} v[2:3], v[0:1], off define amdgpu_ps void @dpp8_v4i16(<4 x i16> %in, ptr addrspace(1) %out) { +; GFX10-SDAG-LABEL: dpp8_v4i16: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX10-SDAG-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX10-SDAG-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: dpp8_v4i16: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX10-GISEL-NEXT: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX10-GISEL-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: dpp8_v4i16: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX11-SDAG-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX11-SDAG-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: dpp8_v4i16: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX11-GISEL-NEXT: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX11-GISEL-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: dpp8_v4i16: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX12-SDAG-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX12-SDAG-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: dpp8_v4i16: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX12-GISEL-NEXT: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX12-GISEL-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX12-GISEL-NEXT: s_endpgm %tmp0 = call <4 x i16> @llvm.amdgcn.mov.dpp8.v4i16(<4 x i16> %in, i32 1) store <4 x i16> %tmp0, ptr addrspace(1) %out ret void } -; GFX10PLUS-LABEL: {{^}}dpp8_v4f16: -; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] -; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] -; GFX10PLUS-DAG: global_store_{{dwordx2|b64}} v[2:3], v[0:1], off define amdgpu_ps void @dpp8_v4f16(<4 x half> %in, ptr addrspace(1) %out) { +; GFX10-SDAG-LABEL: dpp8_v4f16: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX10-SDAG-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX10-SDAG-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: dpp8_v4f16: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX10-GISEL-NEXT: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX10-GISEL-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: dpp8_v4f16: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX11-SDAG-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX11-SDAG-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: dpp8_v4f16: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX11-GISEL-NEXT: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX11-GISEL-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: dpp8_v4f16: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX12-SDAG-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX12-SDAG-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: dpp8_v4f16: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX12-GISEL-NEXT: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX12-GISEL-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX12-GISEL-NEXT: s_endpgm %tmp0 = call <4 x half> @llvm.amdgcn.mov.dpp8.v4f16(<4 x half> %in, i32 1) store <4 x half> %tmp0, ptr addrspace(1) %out ret void } -; GFX10PLUS-LABEL: {{^}}dpp8_float: -; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] -; GFX10PLUS-DAG: global_store_{{dword|b32}} v[1:2], v0, off define amdgpu_ps void @dpp8_float(float %in, ptr addrspace(1) %out) { +; GFX10-LABEL: dpp8_float: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX10-NEXT: global_store_dword v[1:2], v0, off +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: dpp8_float: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX11-NEXT: global_store_b32 v[1:2], v0, off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: dpp8_float: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX12-NEXT: global_store_b32 v[1:2], v0, off +; GFX12-NEXT: s_endpgm %tmp0 = call float @llvm.amdgcn.mov.dpp8.f32(float %in, i32 1) store float %tmp0, ptr addrspace(1) %out ret void } -; GFX10PLUS-LABEL: {{^}}dpp8_v3f32: -; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] -; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] -; GFX10PLUS-DAG: v_mov_b32_dpp v2, v2 dpp8:[1,0,0,0,0,0,0,0] -; GFX10PLUS-DAG: global_store_{{dwordx3|b96}} v[3:4], v[0:2], off define amdgpu_ps void @dpp8_v3f32(<3 x float> %in, ptr addrspace(1) %out) { +; GFX10-SDAG-LABEL: dpp8_v3f32: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: v_mov_b32_dpp v2, v2 dpp8:[1,0,0,0,0,0,0,0] +; GFX10-SDAG-NEXT: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX10-SDAG-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX10-SDAG-NEXT: global_store_dwordx3 v[3:4], v[0:2], off +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: dpp8_v3f32: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX10-GISEL-NEXT: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX10-GISEL-NEXT: v_mov_b32_dpp v2, v2 dpp8:[1,0,0,0,0,0,0,0] +; GFX10-GISEL-NEXT: global_store_dwordx3 v[3:4], v[0:2], off +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: dpp8_v3f32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: v_mov_b32_dpp v2, v2 dpp8:[1,0,0,0,0,0,0,0] +; GFX11-SDAG-NEXT: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX11-SDAG-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX11-SDAG-NEXT: global_store_b96 v[3:4], v[0:2], off +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: dpp8_v3f32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX11-GISEL-NEXT: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX11-GISEL-NEXT: v_mov_b32_dpp v2, v2 dpp8:[1,0,0,0,0,0,0,0] +; GFX11-GISEL-NEXT: global_store_b96 v[3:4], v[0:2], off +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: dpp8_v3f32: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: v_mov_b32_dpp v2, v2 dpp8:[1,0,0,0,0,0,0,0] +; GFX12-SDAG-NEXT: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX12-SDAG-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX12-SDAG-NEXT: global_store_b96 v[3:4], v[0:2], off +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: dpp8_v3f32: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX12-GISEL-NEXT: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX12-GISEL-NEXT: v_mov_b32_dpp v2, v2 dpp8:[1,0,0,0,0,0,0,0] +; GFX12-GISEL-NEXT: global_store_b96 v[3:4], v[0:2], off +; GFX12-GISEL-NEXT: s_endpgm %tmp0 = call <3 x float> @llvm.amdgcn.mov.dpp8.v3f32(<3 x float> %in, i32 1) store <3 x float> %tmp0, ptr addrspace(1) %out ret void } -; GFX10PLUS-LABEL: {{^}}dpp8_half: -; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] -; GFX10PLUS-DAG: global_store_{{short|b16}} v[1:2], v0, off define amdgpu_ps void @dpp8_half(half %in, ptr addrspace(1) %out) { +; GFX10-LABEL: dpp8_half: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX10-NEXT: global_store_short v[1:2], v0, off +; GFX10-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: dpp8_half: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: ; implicit-def: $vgpr3 +; GFX11-SDAG-NEXT: v_mov_b16_e32 v3.l, v0.l +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_mov_b32_dpp v3, v3 dpp8:[1,0,0,0,0,0,0,0] +; GFX11-SDAG-NEXT: global_store_b16 v[1:2], v3, off +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: dpp8_half: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX11-GISEL-NEXT: global_store_b16 v[1:2], v0, off +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-LABEL: dpp8_half: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX12-NEXT: global_store_b16 v[1:2], v0, off +; GFX12-NEXT: s_endpgm %tmp0 = call half @llvm.amdgcn.mov.dpp8.f16(half %in, i32 1) store half %tmp0, ptr addrspace(1) %out ret void } -; GFX10PLUS-LABEL: {{^}}dpp8_bfloat: -; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] -; GFX10PLUS-DAG: global_store_{{short|b16}} v[1:2], v0, off define amdgpu_ps void @dpp8_bfloat(bfloat %in, ptr addrspace(1) %out) { +; GFX10-LABEL: dpp8_bfloat: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX10-NEXT: global_store_short v[1:2], v0, off +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: dpp8_bfloat: +; GFX11: ; %bb.0: +; GFX11-NEXT: ; implicit-def: $vgpr3 +; GFX11-NEXT: v_mov_b16_e32 v3.l, v0.l +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_dpp v3, v3 dpp8:[1,0,0,0,0,0,0,0] +; GFX11-NEXT: global_store_b16 v[1:2], v3, off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: dpp8_bfloat: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX12-NEXT: global_store_b16 v[1:2], v0, off +; GFX12-NEXT: s_endpgm %tmp0 = call bfloat @llvm.amdgcn.mov.dpp8.bf16(bfloat %in, i32 1) store bfloat %tmp0, ptr addrspace(1) %out ret void } -; GFX10PLUS-LABEL: {{^}}dpp8_v4bf16: -; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] -; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] -; GFX10PLUS-DAG: global_store_{{dwordx2|b64}} v[2:3], v[0:1], off define amdgpu_ps void @dpp8_v4bf16(<4 x bfloat> %in, ptr addrspace(1) %out) { +; GFX10-LABEL: dpp8_v4bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX10-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: dpp8_v4bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX11-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: dpp8_v4bf16: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX12-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX12-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX12-NEXT: s_endpgm %tmp0 = call <4 x bfloat> @llvm.amdgcn.mov.dpp8.v4bf16(<4 x bfloat> %in, i32 1) store <4 x bfloat> %tmp0, ptr addrspace(1) %out ret void } -; GFX10PLUS-LABEL: {{^}}dpp8_double: -; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] -; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] -; GFX10PLUS-DAG: global_store_{{dwordx2|b64}} v[2:3], v[0:1], off define amdgpu_ps void @dpp8_double(double %in, ptr addrspace(1) %out) { +; GFX10-SDAG-LABEL: dpp8_double: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX10-SDAG-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX10-SDAG-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: dpp8_double: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX10-GISEL-NEXT: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX10-GISEL-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: dpp8_double: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX11-SDAG-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX11-SDAG-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: dpp8_double: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX11-GISEL-NEXT: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX11-GISEL-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: dpp8_double: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX12-SDAG-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX12-SDAG-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: dpp8_double: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX12-GISEL-NEXT: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0] +; GFX12-GISEL-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX12-GISEL-NEXT: s_endpgm %tmp0 = call double @llvm.amdgcn.mov.dpp8.f64(double %in, i32 1) store double %tmp0, ptr addrspace(1) %out ret void @@ -189,3 +757,5 @@ define amdgpu_ps void @dpp8_double(double %in, ptr addrspace(1) %out) { declare i32 @llvm.amdgcn.mov.dpp8.i32(i32, i32) #0 attributes #0 = { nounwind readnone convergent } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX10PLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll index 4c6095ee594b0..d3ec936d786a3 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll @@ -8551,15 +8551,27 @@ define void @v_permlane16_half(ptr addrspace(1) %out, half %src0, i32 %src1, i32 ; GFX10-NEXT: global_store_short v[0:1], v2, off ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_permlane16_half: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_readfirstlane_b32 s0, v3 -; GFX11-NEXT: v_readfirstlane_b32 s1, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlane16_b32 v2, v2, s0, s1 -; GFX11-NEXT: global_store_b16 v[0:1], v2, off -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SDAG-LABEL: v_permlane16_half: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: ; implicit-def: $vgpr5 +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-SDAG-NEXT: v_mov_b16_e32 v5.l, v2.l +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v4 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX11-SDAG-NEXT: global_store_b16 v[0:1], v5, off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: v_permlane16_half: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v4 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX11-GISEL-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_permlane16_half: ; GFX12: ; %bb.0: @@ -8590,15 +8602,27 @@ define void @v_permlanex16_half(ptr addrspace(1) %out, half %src0, i32 %src1, i3 ; GFX10-NEXT: global_store_short v[0:1], v2, off ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_permlanex16_half: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_readfirstlane_b32 s0, v3 -; GFX11-NEXT: v_readfirstlane_b32 s1, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlanex16_b32 v2, v2, s0, s1 -; GFX11-NEXT: global_store_b16 v[0:1], v2, off -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SDAG-LABEL: v_permlanex16_half: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: ; implicit-def: $vgpr5 +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-SDAG-NEXT: v_mov_b16_e32 v5.l, v2.l +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v4 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; GFX11-SDAG-NEXT: global_store_b16 v[0:1], v5, off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: v_permlanex16_half: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v4 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX11-GISEL-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_permlanex16_half: ; GFX12: ; %bb.0: @@ -8631,12 +8655,14 @@ define void @v_permlane16_bfloat(ptr addrspace(1) %out, bfloat %src0, i32 %src1, ; ; GFX11-LABEL: v_permlane16_bfloat: ; GFX11: ; %bb.0: +; GFX11-NEXT: ; implicit-def: $vgpr5 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-NEXT: v_mov_b16_e32 v5.l, v2.l ; GFX11-NEXT: v_readfirstlane_b32 s1, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlane16_b32 v2, v2, s0, s1 -; GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX11-NEXT: global_store_b16 v[0:1], v5, off ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_permlane16_bfloat: @@ -8670,12 +8696,14 @@ define void @v_permlanex16_bfloat(ptr addrspace(1) %out, bfloat %src0, i32 %src1 ; ; GFX11-LABEL: v_permlanex16_bfloat: ; GFX11: ; %bb.0: +; GFX11-NEXT: ; implicit-def: $vgpr5 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-NEXT: v_mov_b16_e32 v5.l, v2.l ; GFX11-NEXT: v_readfirstlane_b32 s1, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlanex16_b32 v2, v2, s0, s1 -; GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; GFX11-NEXT: global_store_b16 v[0:1], v5, off ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_permlanex16_bfloat: @@ -8707,15 +8735,27 @@ define void @v_permlane16_i16(ptr addrspace(1) %out, i16 %src0, i32 %src1, i32 % ; GFX10-NEXT: global_store_short v[0:1], v2, off ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_permlane16_i16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_readfirstlane_b32 s0, v3 -; GFX11-NEXT: v_readfirstlane_b32 s1, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlane16_b32 v2, v2, s0, s1 -; GFX11-NEXT: global_store_b16 v[0:1], v2, off -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SDAG-LABEL: v_permlane16_i16: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: ; implicit-def: $vgpr5 +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-SDAG-NEXT: v_mov_b16_e32 v5.l, v2.l +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v4 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX11-SDAG-NEXT: global_store_b16 v[0:1], v5, off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: v_permlane16_i16: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v4 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX11-GISEL-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_permlane16_i16: ; GFX12: ; %bb.0: @@ -8746,15 +8786,27 @@ define void @v_permlanex16_i16(ptr addrspace(1) %out, i16 %src0, i32 %src1, i32 ; GFX10-NEXT: global_store_short v[0:1], v2, off ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_permlanex16_i16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_readfirstlane_b32 s0, v3 -; GFX11-NEXT: v_readfirstlane_b32 s1, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_permlanex16_b32 v2, v2, s0, s1 -; GFX11-NEXT: global_store_b16 v[0:1], v2, off -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SDAG-LABEL: v_permlanex16_i16: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: ; implicit-def: $vgpr5 +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-SDAG-NEXT: v_mov_b16_e32 v5.l, v2.l +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v4 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; GFX11-SDAG-NEXT: global_store_b16 v[0:1], v5, off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: v_permlanex16_i16: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v4 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX11-GISEL-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_permlanex16_i16: ; GFX12: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll index 6dd2258420998..5d4bfc30b6515 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll @@ -353,8 +353,11 @@ define amdgpu_kernel void @test_v_f64(ptr addrspace(1) %out, double %src0) #1 { define void @test_half(ptr addrspace(1) %out, half %src0) { ; GFX11-SDAG-LABEL: test_half: ; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: ; implicit-def: $vgpr3 ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2 +; GFX11-SDAG-NEXT: v_mov_b16_e32 v3.l, v2.l +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v3 ; GFX11-SDAG-NEXT: global_store_b16 v[0:1], v2, off ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -372,15 +375,21 @@ define void @test_half(ptr addrspace(1) %out, half %src0) { define void @test_bfloat(ptr addrspace(1) %out, bfloat %src0) { ; GFX11-SDAG-LABEL: test_bfloat: ; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: ; implicit-def: $vgpr3 ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2 +; GFX11-SDAG-NEXT: v_mov_b16_e32 v3.l, v2.l +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v3 ; GFX11-SDAG-NEXT: global_store_b16 v[0:1], v2, off ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: test_bfloat: ; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: ; implicit-def: $vgpr3 ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_permlane64_b32 v2, v2 +; GFX11-GISEL-NEXT: v_mov_b16_e32 v3.l, v2.l +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_permlane64_b32 v2, v3 ; GFX11-GISEL-NEXT: global_store_b16 v[0:1], v2, off ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %v = call bfloat @llvm.amdgcn.permlane64.bf16(bfloat %src0) @@ -391,8 +400,11 @@ define void @test_bfloat(ptr addrspace(1) %out, bfloat %src0) { define void @test_i16(ptr addrspace(1) %out, i16 %src0) { ; GFX11-SDAG-LABEL: test_i16: ; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: ; implicit-def: $vgpr3 ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2 +; GFX11-SDAG-NEXT: v_mov_b16_e32 v3.l, v2.l +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v3 ; GFX11-SDAG-NEXT: global_store_b16 v[0:1], v2, off ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll index ec7d7d467ffc6..9b8707f5a1508 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll @@ -31,7 +31,9 @@ define amdgpu_ps void @buffer_store_bf16(ptr addrspace(8) inreg %rsrc, bfloat %d ; ; GFX11-LABEL: buffer_store_bf16: ; GFX11: ; %bb.0: -; GFX11-NEXT: buffer_store_b16 v0, v1, s[0:3], 0 offen +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: v_mov_b16_e32 v2.l, v0.l +; GFX11-NEXT: buffer_store_b16 v2, v1, s[0:3], 0 offen ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: buffer_store_bf16: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.waitcnt.out.order.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.waitcnt.out.order.ll index f0031dd3e93c0..78473ec697a5a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.waitcnt.out.order.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.waitcnt.out.order.ll @@ -85,9 +85,10 @@ define amdgpu_ps <3 x float> @sample_gather(<8 x i32> inreg %rsrc, <4 x i32> inr define amdgpu_ps <3 x float> @sample_load(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, <8 x i32> inreg %rsrc2, i16 %s.16, i16 %t.16, i16 %fragid) { ; GFX11-TRUE16-LABEL: sample_load: ; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-TRUE16-NEXT: image_msaa_load v[0:3], v[2:3], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16 ; GFX11-TRUE16-NEXT: image_sample_lz v2, [v4, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D @@ -105,9 +106,10 @@ define amdgpu_ps <3 x float> @sample_load(<8 x i32> inreg %rsrc, <4 x i32> inreg ; ; GFX1150-TRUE16-LABEL: sample_load: ; GFX1150-TRUE16: ; %bb.0: +; GFX1150-TRUE16-NEXT: ; implicit-def: $vgpr3 +; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l ; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l ; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l -; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l ; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v4, 0 ; GFX1150-TRUE16-NEXT: image_msaa_load v[0:3], v[2:3], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16 ; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -127,9 +129,11 @@ define amdgpu_ps <3 x float> @sample_load(<8 x i32> inreg %rsrc, <4 x i32> inreg ; ; GFX12-TRUE16-LABEL: sample_load: ; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr3 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-TRUE16-NEXT: image_msaa_load v[0:3], [v0, v2], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16 +; GFX12-TRUE16-NEXT: image_msaa_load v[0:3], [v0, v3], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16 ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: image_sample_lz v2, [v4, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 @@ -159,9 +163,10 @@ define amdgpu_ps <3 x float> @sample_load(<8 x i32> inreg %rsrc, <4 x i32> inreg define amdgpu_ps <3 x float> @load_sample(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, <8 x i32> inreg %rsrc2, i16 %s.16, i16 %t.16, i16 %fragid) { ; GFX11-TRUE16-LABEL: load_sample: ; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-TRUE16-NEXT: image_msaa_load v[0:3], v[2:3], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16 ; GFX11-TRUE16-NEXT: image_sample_lz v2, [v4, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D @@ -179,9 +184,10 @@ define amdgpu_ps <3 x float> @load_sample(<8 x i32> inreg %rsrc, <4 x i32> inreg ; ; GFX1150-TRUE16-LABEL: load_sample: ; GFX1150-TRUE16: ; %bb.0: +; GFX1150-TRUE16-NEXT: ; implicit-def: $vgpr3 +; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l ; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l ; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l -; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l ; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v4, 0 ; GFX1150-TRUE16-NEXT: image_msaa_load v[0:3], v[2:3], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16 ; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -201,9 +207,11 @@ define amdgpu_ps <3 x float> @load_sample(<8 x i32> inreg %rsrc, <4 x i32> inreg ; ; GFX12-TRUE16-LABEL: load_sample: ; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr3 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-TRUE16-NEXT: image_msaa_load v[0:3], [v0, v2], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16 +; GFX12-TRUE16-NEXT: image_msaa_load v[0:3], [v0, v3], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16 ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: image_sample_lz v2, [v4, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll index a10c861601c2c..9e3aa29f3dc73 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll @@ -2074,10 +2074,12 @@ define void @test_writelane_half(ptr addrspace(1) %out, half %src, i32 %src1) { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: global_load_u16 v4, v[0:1], off -; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1100-SDAG-NEXT: ; implicit-def: $vgpr5 ; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1100-SDAG-NEXT: v_mov_b16_e32 v5.l, v2.l +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v5 ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_writelane_b32 v4, s0, s1 ; GFX1100-SDAG-NEXT: global_store_b16 v[0:1], v4, off ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -2230,10 +2232,12 @@ define void @test_writelane_bfloat(ptr addrspace(1) %out, bfloat %src, i32 %src1 ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: global_load_u16 v4, v[0:1], off -; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1100-SDAG-NEXT: ; implicit-def: $vgpr5 ; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1100-SDAG-NEXT: v_mov_b16_e32 v5.l, v2.l +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v5 ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_writelane_b32 v4, s0, s1 ; GFX1100-SDAG-NEXT: global_store_b16 v[0:1], v4, off ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -2266,10 +2270,12 @@ define void @test_writelane_bfloat(ptr addrspace(1) %out, bfloat %src, i32 %src1 ; GFX1100-GISEL: ; %bb.0: ; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-GISEL-NEXT: global_load_u16 v4, v[0:1], off -; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1100-GISEL-NEXT: ; implicit-def: $vgpr5 ; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1100-GISEL-NEXT: v_mov_b16_e32 v5.l, v2.l +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s0, v5 ; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_writelane_b32 v4, s0, s1 ; GFX1100-GISEL-NEXT: global_store_b16 v[0:1], v4, off ; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -2308,10 +2314,12 @@ define void @test_writelane_i16(ptr addrspace(1) %out, i16 %src, i32 %src1) { ; GFX1100-SDAG: ; %bb.0: ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-SDAG-NEXT: global_load_u16 v4, v[0:1], off -; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1100-SDAG-NEXT: ; implicit-def: $vgpr5 ; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1100-SDAG-NEXT: v_mov_b16_e32 v5.l, v2.l +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v5 ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_writelane_b32 v4, s0, s1 ; GFX1100-SDAG-NEXT: global_store_b16 v[0:1], v4, off ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll index 22f562ab8557b..3eade74cccf3e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll @@ -215,9 +215,11 @@ define half @test_ldexp_f16_i8(half %a, i8 %b) { ; ; GFX11-SDAG-TRUE16-LABEL: test_ldexp_f16_i8: ; GFX11-SDAG-TRUE16: ; %bb.0: +; GFX11-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr2 ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v1, v1, 0, 8 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v1, v2, 0, 8 ; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v1.l ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.ll index b241b9b800d2a..b055fa8494438 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.round.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.round.ll @@ -833,6 +833,7 @@ define amdgpu_kernel void @round_f16(ptr addrspace(1) %out, i32 %x.arg) #0 { ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_trunc_f16_e32 v0.l, s2 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s2 @@ -995,11 +996,12 @@ define amdgpu_kernel void @round_v2f16(ptr addrspace(1) %out, i32 %in.arg) #0 { ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s2, 16 ; GFX11-TRUE16-NEXT: v_trunc_f16_e32 v0.l, s2 ; GFX11-TRUE16-NEXT: v_trunc_f16_e32 v0.h, s3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_sub_f16_e32 v1.l, s2, v0.l ; GFX11-TRUE16-NEXT: v_sub_f16_e32 v1.h, s3, v0.h @@ -1007,13 +1009,14 @@ define amdgpu_kernel void @round_v2f16(ptr addrspace(1) %out, i32 %in.arg) #0 { ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_cmp_ge_f16_e64 s6, |v1.l|, 0.5 ; GFX11-TRUE16-NEXT: v_cmp_ge_f16_e64 s7, |v1.h|, 0.5 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s2 ; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, 0, 0x3c00, s6 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, 0, 0x3c00, s7 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, 0, 0x3c00, s6 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, 0, 0x3c00, s7 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fff, v2, v1 -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, v3, v4 +; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fff, v3, v1 +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, v4, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l ; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.h, v0.h, v2.l diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll index 5b2213592f495..579a8dd1e2c19 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll @@ -5831,17 +5831,17 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_sextload_v4i1_to_v4i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_v4i1_to_v4i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s10, s6 -; GFX6-NEXT: s_mov_b32 s11, s7 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s10, s2 +; GFX6-NEXT: s_mov_b32 s11, s3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s8, s2 -; GFX6-NEXT: s_mov_b32 s9, s3 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: s_mov_b32 s9, s7 ; GFX6-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 -; GFX6-NEXT: s_mov_b32 s4, s0 -; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 2, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 3, v0 @@ -5854,8 +5854,8 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i64(ptr addrspace(1) %out ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v7, 31, v6 ; GFX6-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: constant_sextload_v4i1_to_v4i64: @@ -6215,11 +6215,12 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(ptr addrspace(1) %out ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: flat_load_ubyte v0, v[0:1] +; GFX8-NEXT: flat_load_ubyte v2, v[0:1] +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX8-NEXT: v_mov_b32_e32 v17, s1 ; GFX8-NEXT: v_mov_b32_e32 v16, s0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_readfirstlane_b32 s3, v0 +; GFX8-NEXT: v_readfirstlane_b32 s3, v2 ; GFX8-NEXT: s_lshr_b32 s2, s3, 6 ; GFX8-NEXT: s_lshr_b32 s4, s3, 7 ; GFX8-NEXT: s_lshr_b32 s6, s3, 4 @@ -6326,9 +6327,10 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(ptr addrspace(1) %out ; GFX12-LABEL: constant_sextload_v8i1_to_v8i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v16, 0 +; GFX12-NEXT: v_mov_b32_e32 v18, 0 +; GFX12-NEXT: ; implicit-def: $vgpr16_vgpr17 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_u8 v0, v16, s[2:3] +; GFX12-NEXT: global_load_u8 v0, v18, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s3, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -6359,18 +6361,19 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(ptr addrspace(1) %out ; GFX12-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15 ; GFX12-NEXT: v_ashrrev_i32_e32 v13, 31, v12 ; GFX12-NEXT: s_clause 0x3 -; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:48 -; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:32 -; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16 -; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1] +; GFX12-NEXT: global_store_b128 v18, v[0:3], s[0:1] offset:48 +; GFX12-NEXT: global_store_b128 v18, v[4:7], s[0:1] offset:32 +; GFX12-NEXT: global_store_b128 v18, v[8:11], s[0:1] offset:16 +; GFX12-NEXT: global_store_b128 v18, v[12:15], s[0:1] ; GFX12-NEXT: s_endpgm ; ; GFX1250-LABEL: constant_sextload_v8i1_to_v8i64: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1250-NEXT: v_mov_b32_e32 v16, 0 +; GFX1250-NEXT: v_mov_b32_e32 v18, 0 +; GFX1250-NEXT: ; implicit-def: $vgpr16_vgpr17 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_load_u8 v0, v16, s[2:3] +; GFX1250-NEXT: global_load_u8 v0, v18, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -6399,10 +6402,10 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(ptr addrspace(1) %out ; GFX1250-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_ashrrev_i32 v13, 31, v12 ; GFX1250-NEXT: v_mov_b32_e32 v15, s15 ; GFX1250-NEXT: s_clause 0x3 -; GFX1250-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:48 -; GFX1250-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:32 -; GFX1250-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16 -; GFX1250-NEXT: global_store_b128 v16, v[12:15], s[0:1] +; GFX1250-NEXT: global_store_b128 v18, v[0:3], s[0:1] offset:48 +; GFX1250-NEXT: global_store_b128 v18, v[4:7], s[0:1] offset:32 +; GFX1250-NEXT: global_store_b128 v18, v[8:11], s[0:1] offset:16 +; GFX1250-NEXT: global_store_b128 v18, v[12:15], s[0:1] ; GFX1250-NEXT: s_endpgm %load = load <8 x i1>, ptr addrspace(4) %in %ext = sext <8 x i1> %load to <8 x i64> @@ -6823,6 +6826,7 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_load_ushort v0, v[0:1] +; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX8-NEXT: v_mov_b32_e32 v19, s1 ; GFX8-NEXT: v_mov_b32_e32 v18, s0 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -7035,6 +7039,7 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o ; GFX12-NEXT: global_load_u16 v0, v32, s[2:3] ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s3, v0 +; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: s_lshr_b32 s4, s3, 15 ; GFX12-NEXT: s_lshr_b32 s2, s3, 14 ; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 @@ -7106,6 +7111,7 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o ; GFX1250-NEXT: global_load_u16 v0, v32, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_readfirstlane_b32 s3, v0 +; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_mov_b32_e32 v28, s3 ; GFX1250-NEXT: s_lshr_b32 s2, s3, 14 @@ -7799,16 +7805,16 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshr_b32 s38, s4, 30 +; GFX6-NEXT: s_lshr_b32 s36, s4, 30 ; GFX6-NEXT: s_lshr_b32 s40, s4, 31 -; GFX6-NEXT: s_lshr_b32 s34, s4, 28 -; GFX6-NEXT: s_lshr_b32 s36, s4, 29 -; GFX6-NEXT: s_lshr_b32 s28, s4, 26 -; GFX6-NEXT: s_lshr_b32 s30, s4, 27 -; GFX6-NEXT: s_lshr_b32 s24, s4, 24 -; GFX6-NEXT: s_lshr_b32 s26, s4, 25 +; GFX6-NEXT: s_lshr_b32 s30, s4, 28 +; GFX6-NEXT: s_lshr_b32 s38, s4, 29 +; GFX6-NEXT: s_lshr_b32 s26, s4, 26 +; GFX6-NEXT: s_lshr_b32 s34, s4, 27 +; GFX6-NEXT: s_lshr_b32 s22, s4, 24 +; GFX6-NEXT: s_lshr_b32 s28, s4, 25 ; GFX6-NEXT: s_lshr_b32 s20, s4, 22 -; GFX6-NEXT: s_lshr_b32 s22, s4, 23 +; GFX6-NEXT: s_lshr_b32 s24, s4, 23 ; GFX6-NEXT: s_lshr_b32 s18, s4, 20 ; GFX6-NEXT: s_lshr_b32 s6, s4, 21 ; GFX6-NEXT: s_lshr_b32 s8, s4, 18 @@ -7821,48 +7827,48 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX6-NEXT: v_mov_b32_e32 v0, s44 ; GFX6-NEXT: v_mov_b32_e32 v1, s45 ; GFX6-NEXT: s_lshr_b32 s44, s4, 12 -; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v2, s38 -; GFX6-NEXT: v_mov_b32_e32 v3, s39 -; GFX6-NEXT: s_lshr_b32 s38, s4, 13 +; GFX6-NEXT: v_mov_b32_e32 v2, s36 +; GFX6-NEXT: v_mov_b32_e32 v3, s37 +; GFX6-NEXT: s_lshr_b32 s36, s4, 13 ; GFX6-NEXT: v_mov_b32_e32 v4, s40 ; GFX6-NEXT: v_mov_b32_e32 v5, s41 ; GFX6-NEXT: s_lshr_b32 s40, s4, 10 -; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v6, s34 -; GFX6-NEXT: v_mov_b32_e32 v7, s35 -; GFX6-NEXT: s_lshr_b32 s34, s4, 11 -; GFX6-NEXT: v_mov_b32_e32 v8, s36 -; GFX6-NEXT: v_mov_b32_e32 v9, s37 -; GFX6-NEXT: s_lshr_b32 s36, s4, 8 -; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v10, s28 -; GFX6-NEXT: v_mov_b32_e32 v11, s29 -; GFX6-NEXT: s_lshr_b32 s28, s4, 9 -; GFX6-NEXT: v_mov_b32_e32 v12, s30 -; GFX6-NEXT: v_mov_b32_e32 v13, s31 -; GFX6-NEXT: s_lshr_b32 s30, s4, 6 -; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v6, s30 +; GFX6-NEXT: v_mov_b32_e32 v7, s31 +; GFX6-NEXT: s_lshr_b32 s30, s4, 11 +; GFX6-NEXT: v_mov_b32_e32 v8, s38 +; GFX6-NEXT: v_mov_b32_e32 v9, s39 +; GFX6-NEXT: s_lshr_b32 s38, s4, 8 ; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v14, s24 -; GFX6-NEXT: v_mov_b32_e32 v15, s25 -; GFX6-NEXT: s_lshr_b32 s24, s4, 7 -; GFX6-NEXT: v_mov_b32_e32 v16, s26 -; GFX6-NEXT: v_mov_b32_e32 v17, s27 -; GFX6-NEXT: s_lshr_b32 s26, s4, 4 +; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v10, s26 +; GFX6-NEXT: v_mov_b32_e32 v11, s27 +; GFX6-NEXT: s_lshr_b32 s26, s4, 9 +; GFX6-NEXT: v_mov_b32_e32 v12, s34 +; GFX6-NEXT: v_mov_b32_e32 v13, s35 +; GFX6-NEXT: s_lshr_b32 s34, s4, 6 ; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v14, s22 +; GFX6-NEXT: v_mov_b32_e32 v15, s23 +; GFX6-NEXT: s_lshr_b32 s22, s4, 7 +; GFX6-NEXT: v_mov_b32_e32 v16, s28 +; GFX6-NEXT: v_mov_b32_e32 v17, s29 +; GFX6-NEXT: s_lshr_b32 s28, s4, 4 +; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:240 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s20 ; GFX6-NEXT: v_mov_b32_e32 v3, s21 ; GFX6-NEXT: s_lshr_b32 s20, s4, 5 -; GFX6-NEXT: v_mov_b32_e32 v4, s22 -; GFX6-NEXT: v_mov_b32_e32 v5, s23 -; GFX6-NEXT: s_lshr_b32 s22, s4, 2 +; GFX6-NEXT: v_mov_b32_e32 v4, s24 +; GFX6-NEXT: v_mov_b32_e32 v5, s25 +; GFX6-NEXT: s_lshr_b32 s24, s4, 2 ; GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:224 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -7872,16 +7878,16 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX6-NEXT: s_lshr_b32 s4, s4, 1 ; GFX6-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 @@ -7917,36 +7923,36 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s44 ; GFX6-NEXT: v_mov_b32_e32 v3, s45 -; GFX6-NEXT: v_mov_b32_e32 v4, s38 -; GFX6-NEXT: v_mov_b32_e32 v5, s39 +; GFX6-NEXT: v_mov_b32_e32 v4, s36 +; GFX6-NEXT: v_mov_b32_e32 v5, s37 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:96 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s40 ; GFX6-NEXT: v_mov_b32_e32 v3, s41 -; GFX6-NEXT: v_mov_b32_e32 v4, s34 -; GFX6-NEXT: v_mov_b32_e32 v5, s35 +; GFX6-NEXT: v_mov_b32_e32 v4, s30 +; GFX6-NEXT: v_mov_b32_e32 v5, s31 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:80 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s36 -; GFX6-NEXT: v_mov_b32_e32 v3, s37 -; GFX6-NEXT: v_mov_b32_e32 v4, s28 -; GFX6-NEXT: v_mov_b32_e32 v5, s29 +; GFX6-NEXT: v_mov_b32_e32 v2, s38 +; GFX6-NEXT: v_mov_b32_e32 v3, s39 +; GFX6-NEXT: v_mov_b32_e32 v4, s26 +; GFX6-NEXT: v_mov_b32_e32 v5, s27 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:64 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s30 -; GFX6-NEXT: v_mov_b32_e32 v3, s31 -; GFX6-NEXT: v_mov_b32_e32 v4, s24 -; GFX6-NEXT: v_mov_b32_e32 v5, s25 +; GFX6-NEXT: v_mov_b32_e32 v2, s34 +; GFX6-NEXT: v_mov_b32_e32 v3, s35 +; GFX6-NEXT: v_mov_b32_e32 v4, s22 +; GFX6-NEXT: v_mov_b32_e32 v5, s23 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:48 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s26 -; GFX6-NEXT: v_mov_b32_e32 v3, s27 +; GFX6-NEXT: v_mov_b32_e32 v2, s28 +; GFX6-NEXT: v_mov_b32_e32 v3, s29 ; GFX6-NEXT: v_mov_b32_e32 v4, s20 ; GFX6-NEXT: v_mov_b32_e32 v5, s21 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:32 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s22 -; GFX6-NEXT: v_mov_b32_e32 v3, s23 +; GFX6-NEXT: v_mov_b32_e32 v2, s24 +; GFX6-NEXT: v_mov_b32_e32 v3, s25 ; GFX6-NEXT: v_mov_b32_e32 v4, s18 ; GFX6-NEXT: v_mov_b32_e32 v5, s19 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:16 @@ -7962,22 +7968,22 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshr_b32 s44, s2, 30 -; GFX8-NEXT: s_lshr_b32 s46, s2, 31 -; GFX8-NEXT: s_lshr_b32 s48, s2, 28 -; GFX8-NEXT: s_lshr_b32 s50, s2, 29 -; GFX8-NEXT: s_lshr_b32 s52, s2, 26 -; GFX8-NEXT: s_lshr_b32 s54, s2, 27 -; GFX8-NEXT: s_lshr_b32 s56, s2, 24 -; GFX8-NEXT: s_lshr_b32 s58, s2, 25 +; GFX8-NEXT: s_lshr_b32 s36, s2, 30 +; GFX8-NEXT: s_lshr_b32 s38, s2, 31 +; GFX8-NEXT: s_lshr_b32 s40, s2, 28 +; GFX8-NEXT: s_lshr_b32 s44, s2, 29 +; GFX8-NEXT: s_lshr_b32 s48, s2, 26 +; GFX8-NEXT: s_lshr_b32 s50, s2, 27 +; GFX8-NEXT: s_lshr_b32 s52, s2, 24 +; GFX8-NEXT: s_lshr_b32 s56, s2, 25 ; GFX8-NEXT: s_lshr_b32 s60, s2, 22 ; GFX8-NEXT: s_lshr_b32 s62, s2, 23 ; GFX8-NEXT: s_lshr_b32 s64, s2, 20 ; GFX8-NEXT: s_lshr_b32 s66, s2, 21 -; GFX8-NEXT: s_lshr_b32 s42, s2, 18 -; GFX8-NEXT: s_lshr_b32 s40, s2, 19 -; GFX8-NEXT: s_lshr_b32 s38, s2, 16 -; GFX8-NEXT: s_lshr_b32 s36, s2, 17 +; GFX8-NEXT: s_lshr_b32 s58, s2, 18 +; GFX8-NEXT: s_lshr_b32 s54, s2, 19 +; GFX8-NEXT: s_lshr_b32 s46, s2, 16 +; GFX8-NEXT: s_lshr_b32 s42, s2, 17 ; GFX8-NEXT: s_lshr_b32 s34, s2, 14 ; GFX8-NEXT: s_lshr_b32 s30, s2, 15 ; GFX8-NEXT: s_lshr_b32 s28, s2, 12 @@ -8009,94 +8015,92 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX8-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[66:67], s[66:67], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[64:65], s[64:65], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[62:63], s[62:63], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 -; GFX8-NEXT: v_mov_b32_e32 v0, s44 -; GFX8-NEXT: s_add_u32 s44, s0, 0xf0 -; GFX8-NEXT: v_mov_b32_e32 v1, s45 -; GFX8-NEXT: s_addc_u32 s45, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s44 -; GFX8-NEXT: v_mov_b32_e32 v2, s46 -; GFX8-NEXT: v_mov_b32_e32 v3, s47 -; GFX8-NEXT: v_mov_b32_e32 v5, s45 -; GFX8-NEXT: s_add_u32 s44, s0, 0xe0 +; GFX8-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 +; GFX8-NEXT: v_mov_b32_e32 v0, s36 +; GFX8-NEXT: s_add_u32 s36, s0, 0xf0 +; GFX8-NEXT: v_mov_b32_e32 v1, s37 +; GFX8-NEXT: s_addc_u32 s37, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s36 +; GFX8-NEXT: v_mov_b32_e32 v2, s38 +; GFX8-NEXT: v_mov_b32_e32 v3, s39 +; GFX8-NEXT: v_mov_b32_e32 v5, s37 +; GFX8-NEXT: s_add_u32 s36, s0, 0xe0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_addc_u32 s37, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s36 +; GFX8-NEXT: v_mov_b32_e32 v0, s40 +; GFX8-NEXT: v_mov_b32_e32 v1, s41 +; GFX8-NEXT: v_mov_b32_e32 v2, s44 +; GFX8-NEXT: v_mov_b32_e32 v3, s45 +; GFX8-NEXT: v_mov_b32_e32 v5, s37 +; GFX8-NEXT: s_add_u32 s36, s0, 0xd0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: s_addc_u32 s45, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s44 +; GFX8-NEXT: s_addc_u32 s37, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s36 ; GFX8-NEXT: v_mov_b32_e32 v0, s48 ; GFX8-NEXT: v_mov_b32_e32 v1, s49 ; GFX8-NEXT: v_mov_b32_e32 v2, s50 ; GFX8-NEXT: v_mov_b32_e32 v3, s51 -; GFX8-NEXT: v_mov_b32_e32 v5, s45 -; GFX8-NEXT: s_add_u32 s44, s0, 0xd0 +; GFX8-NEXT: v_mov_b32_e32 v5, s37 +; GFX8-NEXT: s_add_u32 s36, s0, 0xc0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: s_addc_u32 s45, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s44 +; GFX8-NEXT: s_addc_u32 s37, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s36 ; GFX8-NEXT: v_mov_b32_e32 v0, s52 ; GFX8-NEXT: v_mov_b32_e32 v1, s53 -; GFX8-NEXT: v_mov_b32_e32 v2, s54 -; GFX8-NEXT: v_mov_b32_e32 v3, s55 -; GFX8-NEXT: v_mov_b32_e32 v5, s45 -; GFX8-NEXT: s_add_u32 s44, s0, 0xc0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: s_addc_u32 s45, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s44 -; GFX8-NEXT: v_mov_b32_e32 v0, s56 -; GFX8-NEXT: v_mov_b32_e32 v1, s57 -; GFX8-NEXT: v_mov_b32_e32 v2, s58 -; GFX8-NEXT: v_mov_b32_e32 v3, s59 -; GFX8-NEXT: v_mov_b32_e32 v5, s45 -; GFX8-NEXT: s_add_u32 s44, s0, 0xb0 +; GFX8-NEXT: v_mov_b32_e32 v2, s56 +; GFX8-NEXT: v_mov_b32_e32 v3, s57 +; GFX8-NEXT: v_mov_b32_e32 v5, s37 +; GFX8-NEXT: s_add_u32 s36, s0, 0xb0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: s_addc_u32 s45, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s44 +; GFX8-NEXT: s_addc_u32 s37, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s36 ; GFX8-NEXT: v_mov_b32_e32 v0, s60 ; GFX8-NEXT: v_mov_b32_e32 v1, s61 ; GFX8-NEXT: v_mov_b32_e32 v2, s62 ; GFX8-NEXT: v_mov_b32_e32 v3, s63 -; GFX8-NEXT: v_mov_b32_e32 v5, s45 -; GFX8-NEXT: s_add_u32 s44, s0, 0xa0 +; GFX8-NEXT: v_mov_b32_e32 v5, s37 +; GFX8-NEXT: s_add_u32 s36, s0, 0xa0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: s_addc_u32 s45, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s44 +; GFX8-NEXT: s_addc_u32 s37, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s36 ; GFX8-NEXT: v_mov_b32_e32 v0, s64 ; GFX8-NEXT: v_mov_b32_e32 v1, s65 ; GFX8-NEXT: v_mov_b32_e32 v2, s66 ; GFX8-NEXT: v_mov_b32_e32 v3, s67 -; GFX8-NEXT: v_mov_b32_e32 v5, s45 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s40 -; GFX8-NEXT: s_add_u32 s40, s0, 0x90 -; GFX8-NEXT: v_mov_b32_e32 v3, s41 -; GFX8-NEXT: s_addc_u32 s41, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s40 -; GFX8-NEXT: v_mov_b32_e32 v0, s42 -; GFX8-NEXT: v_mov_b32_e32 v1, s43 -; GFX8-NEXT: v_mov_b32_e32 v5, s41 +; GFX8-NEXT: v_mov_b32_e32 v5, s37 +; GFX8-NEXT: s_add_u32 s36, s0, 0x90 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s36 +; GFX8-NEXT: s_addc_u32 s37, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s36 +; GFX8-NEXT: v_mov_b32_e32 v0, s58 +; GFX8-NEXT: v_mov_b32_e32 v1, s59 +; GFX8-NEXT: v_mov_b32_e32 v2, s54 +; GFX8-NEXT: v_mov_b32_e32 v3, s55 +; GFX8-NEXT: v_mov_b32_e32 v5, s37 ; GFX8-NEXT: s_add_u32 s36, s0, 0x80 -; GFX8-NEXT: v_mov_b32_e32 v3, s37 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_addc_u32 s37, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v4, s36 -; GFX8-NEXT: v_mov_b32_e32 v0, s38 -; GFX8-NEXT: v_mov_b32_e32 v1, s39 +; GFX8-NEXT: v_mov_b32_e32 v0, s46 +; GFX8-NEXT: v_mov_b32_e32 v1, s47 +; GFX8-NEXT: v_mov_b32_e32 v2, s42 +; GFX8-NEXT: v_mov_b32_e32 v3, s43 ; GFX8-NEXT: v_mov_b32_e32 v5, s37 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_nop 0 @@ -8374,38 +8378,38 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshr_b32 s34, s2, 30 -; GFX12-NEXT: s_lshr_b32 s36, s2, 31 -; GFX12-NEXT: s_lshr_b32 s38, s2, 28 -; GFX12-NEXT: s_lshr_b32 s40, s2, 29 -; GFX12-NEXT: s_lshr_b32 s42, s2, 26 -; GFX12-NEXT: s_lshr_b32 s44, s2, 27 -; GFX12-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 -; GFX12-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 +; GFX12-NEXT: s_lshr_b32 s24, s2, 30 +; GFX12-NEXT: s_lshr_b32 s28, s2, 31 +; GFX12-NEXT: s_lshr_b32 s34, s2, 28 +; GFX12-NEXT: s_lshr_b32 s36, s2, 29 +; GFX12-NEXT: s_lshr_b32 s38, s2, 26 +; GFX12-NEXT: s_lshr_b32 s42, s2, 27 +; GFX12-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 ; GFX12-NEXT: s_lshr_b32 s46, s2, 24 ; GFX12-NEXT: s_lshr_b32 s48, s2, 25 -; GFX12-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 -; GFX12-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 -; GFX12-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s35 -; GFX12-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 +; GFX12-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s25 ; GFX12-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 -; GFX12-NEXT: v_dual_mov_b32 v0, s34 :: v_dual_mov_b32 v3, s37 -; GFX12-NEXT: v_dual_mov_b32 v2, s36 :: v_dual_mov_b32 v5, s39 -; GFX12-NEXT: s_lshr_b32 s26, s2, 22 +; GFX12-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 +; GFX12-NEXT: v_dual_mov_b32 v0, s24 :: v_dual_mov_b32 v3, s29 +; GFX12-NEXT: v_dual_mov_b32 v2, s28 :: v_dual_mov_b32 v5, s35 +; GFX12-NEXT: s_lshr_b32 s20, s2, 22 ; GFX12-NEXT: s_lshr_b32 s50, s2, 23 ; GFX12-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000 -; GFX12-NEXT: v_dual_mov_b32 v4, s38 :: v_dual_mov_b32 v7, s41 -; GFX12-NEXT: v_dual_mov_b32 v6, s40 :: v_dual_mov_b32 v9, s43 +; GFX12-NEXT: v_dual_mov_b32 v4, s34 :: v_dual_mov_b32 v7, s37 +; GFX12-NEXT: v_dual_mov_b32 v6, s36 :: v_dual_mov_b32 v9, s39 ; GFX12-NEXT: s_lshr_b32 s52, s2, 20 ; GFX12-NEXT: s_lshr_b32 s54, s2, 21 -; GFX12-NEXT: v_dual_mov_b32 v8, s42 :: v_dual_mov_b32 v11, s45 -; GFX12-NEXT: v_dual_mov_b32 v10, s44 :: v_dual_mov_b32 v13, s47 +; GFX12-NEXT: v_dual_mov_b32 v8, s38 :: v_dual_mov_b32 v11, s43 +; GFX12-NEXT: v_dual_mov_b32 v10, s42 :: v_dual_mov_b32 v13, s47 ; GFX12-NEXT: s_lshr_b32 s56, s2, 18 ; GFX12-NEXT: s_lshr_b32 s58, s2, 19 ; GFX12-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000 ; GFX12-NEXT: v_dual_mov_b32 v12, s46 :: v_dual_mov_b32 v15, s49 -; GFX12-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 ; GFX12-NEXT: v_mov_b32_e32 v14, s48 ; GFX12-NEXT: s_lshr_b32 s60, s2, 16 ; GFX12-NEXT: s_lshr_b32 s62, s2, 17 @@ -8420,18 +8424,18 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX12-NEXT: global_store_b128 v24, v[4:7], s[0:1] offset:224 ; GFX12-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:208 ; GFX12-NEXT: global_store_b128 v24, v[12:15], s[0:1] offset:192 -; GFX12-NEXT: v_dual_mov_b32 v0, s26 :: v_dual_mov_b32 v3, s51 -; GFX12-NEXT: v_dual_mov_b32 v1, s27 :: v_dual_mov_b32 v2, s50 +; GFX12-NEXT: v_dual_mov_b32 v0, s20 :: v_dual_mov_b32 v3, s51 +; GFX12-NEXT: v_dual_mov_b32 v1, s21 :: v_dual_mov_b32 v2, s50 ; GFX12-NEXT: v_mov_b32_e32 v5, s53 -; GFX12-NEXT: s_lshr_b32 s30, s2, 12 -; GFX12-NEXT: s_lshr_b32 s28, s2, 13 -; GFX12-NEXT: s_lshr_b32 s24, s2, 10 -; GFX12-NEXT: s_lshr_b32 s22, s2, 11 +; GFX12-NEXT: s_lshr_b32 s44, s2, 12 +; GFX12-NEXT: s_lshr_b32 s40, s2, 13 +; GFX12-NEXT: s_lshr_b32 s30, s2, 10 +; GFX12-NEXT: s_lshr_b32 s26, s2, 11 ; GFX12-NEXT: s_bfe_i64 s[62:63], s[62:63], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x10000 ; GFX12-NEXT: v_dual_mov_b32 v4, s52 :: v_dual_mov_b32 v7, s55 ; GFX12-NEXT: v_dual_mov_b32 v6, s54 :: v_dual_mov_b32 v9, s57 -; GFX12-NEXT: s_lshr_b32 s20, s2, 8 +; GFX12-NEXT: s_lshr_b32 s22, s2, 8 ; GFX12-NEXT: s_lshr_b32 s18, s2, 9 ; GFX12-NEXT: s_bfe_i64 s[66:67], s[66:67], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[64:65], s[64:65], 0x10000 @@ -8439,24 +8443,24 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX12-NEXT: v_dual_mov_b32 v10, s58 :: v_dual_mov_b32 v13, s61 ; GFX12-NEXT: s_lshr_b32 s16, s2, 6 ; GFX12-NEXT: s_lshr_b32 s14, s2, 7 -; GFX12-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 -; GFX12-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 -; GFX12-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 ; GFX12-NEXT: v_dual_mov_b32 v12, s60 :: v_dual_mov_b32 v15, s63 ; GFX12-NEXT: v_dual_mov_b32 v14, s62 :: v_dual_mov_b32 v17, s65 ; GFX12-NEXT: s_lshr_b32 s12, s2, 4 ; GFX12-NEXT: s_lshr_b32 s10, s2, 5 ; GFX12-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 -; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 ; GFX12-NEXT: v_dual_mov_b32 v16, s64 :: v_dual_mov_b32 v19, s67 -; GFX12-NEXT: v_dual_mov_b32 v18, s66 :: v_dual_mov_b32 v21, s31 +; GFX12-NEXT: v_dual_mov_b32 v18, s66 :: v_dual_mov_b32 v21, s45 ; GFX12-NEXT: s_lshr_b32 s8, s2, 2 ; GFX12-NEXT: s_lshr_b32 s6, s2, 3 ; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 -; GFX12-NEXT: v_dual_mov_b32 v20, s30 :: v_dual_mov_b32 v23, s29 -; GFX12-NEXT: v_mov_b32_e32 v22, s28 +; GFX12-NEXT: v_dual_mov_b32 v20, s44 :: v_dual_mov_b32 v23, s41 +; GFX12-NEXT: v_mov_b32_e32 v22, s40 ; GFX12-NEXT: s_clause 0x5 ; GFX12-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:176 ; GFX12-NEXT: global_store_b128 v24, v[4:7], s[0:1] offset:160 @@ -8464,13 +8468,13 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX12-NEXT: global_store_b128 v24, v[12:15], s[0:1] offset:128 ; GFX12-NEXT: global_store_b128 v24, v[16:19], s[0:1] offset:112 ; GFX12-NEXT: global_store_b128 v24, v[20:23], s[0:1] offset:96 -; GFX12-NEXT: v_dual_mov_b32 v0, s24 :: v_dual_mov_b32 v3, s23 -; GFX12-NEXT: v_dual_mov_b32 v1, s25 :: v_dual_mov_b32 v2, s22 -; GFX12-NEXT: v_mov_b32_e32 v5, s21 +; GFX12-NEXT: v_dual_mov_b32 v0, s30 :: v_dual_mov_b32 v3, s27 +; GFX12-NEXT: v_dual_mov_b32 v1, s31 :: v_dual_mov_b32 v2, s26 +; GFX12-NEXT: v_mov_b32_e32 v5, s23 ; GFX12-NEXT: s_lshr_b32 s68, s2, 1 ; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 -; GFX12-NEXT: v_dual_mov_b32 v4, s20 :: v_dual_mov_b32 v7, s19 +; GFX12-NEXT: v_dual_mov_b32 v4, s22 :: v_dual_mov_b32 v7, s19 ; GFX12-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s17 ; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 @@ -8499,39 +8503,39 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_lshr_b32 s34, s2, 30 -; GFX1250-NEXT: s_lshr_b32 s36, s2, 31 -; GFX1250-NEXT: s_lshr_b32 s38, s2, 28 -; GFX1250-NEXT: s_lshr_b32 s40, s2, 29 -; GFX1250-NEXT: s_lshr_b32 s42, s2, 26 -; GFX1250-NEXT: s_lshr_b32 s44, s2, 27 -; GFX1250-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 -; GFX1250-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 +; GFX1250-NEXT: s_lshr_b32 s24, s2, 30 +; GFX1250-NEXT: s_lshr_b32 s28, s2, 31 +; GFX1250-NEXT: s_lshr_b32 s34, s2, 28 +; GFX1250-NEXT: s_lshr_b32 s36, s2, 29 +; GFX1250-NEXT: s_lshr_b32 s38, s2, 26 +; GFX1250-NEXT: s_lshr_b32 s42, s2, 27 +; GFX1250-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 +; GFX1250-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 ; GFX1250-NEXT: s_lshr_b32 s46, s2, 24 ; GFX1250-NEXT: s_lshr_b32 s48, s2, 25 -; GFX1250-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 -; GFX1250-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 -; GFX1250-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v0, s34 -; GFX1250-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 +; GFX1250-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 +; GFX1250-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v2, s24 ; GFX1250-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 -; GFX1250-NEXT: v_dual_mov_b32 v1, s35 :: v_dual_mov_b32 v2, s36 -; GFX1250-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, s38 -; GFX1250-NEXT: s_lshr_b32 s26, s2, 22 +; GFX1250-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 +; GFX1250-NEXT: v_dual_mov_b32 v3, s25 :: v_dual_mov_b32 v4, s28 +; GFX1250-NEXT: v_dual_mov_b32 v5, s29 :: v_dual_mov_b32 v6, s34 +; GFX1250-NEXT: s_lshr_b32 s20, s2, 22 ; GFX1250-NEXT: s_lshr_b32 s50, s2, 23 ; GFX1250-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 ; GFX1250-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000 -; GFX1250-NEXT: v_dual_mov_b32 v5, s39 :: v_dual_mov_b32 v6, s40 -; GFX1250-NEXT: v_dual_mov_b32 v7, s41 :: v_dual_mov_b32 v8, s42 +; GFX1250-NEXT: v_dual_mov_b32 v7, s35 :: v_dual_mov_b32 v8, s36 +; GFX1250-NEXT: v_dual_mov_b32 v9, s37 :: v_dual_mov_b32 v10, s38 ; GFX1250-NEXT: s_lshr_b32 s52, s2, 20 ; GFX1250-NEXT: s_lshr_b32 s54, s2, 21 -; GFX1250-NEXT: v_dual_mov_b32 v9, s43 :: v_dual_mov_b32 v10, s44 -; GFX1250-NEXT: v_dual_mov_b32 v11, s45 :: v_dual_mov_b32 v12, s46 +; GFX1250-NEXT: v_dual_mov_b32 v11, s39 :: v_dual_mov_b32 v12, s42 +; GFX1250-NEXT: v_dual_mov_b32 v13, s43 :: v_dual_mov_b32 v14, s46 ; GFX1250-NEXT: s_lshr_b32 s56, s2, 18 ; GFX1250-NEXT: s_lshr_b32 s58, s2, 19 ; GFX1250-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000 -; GFX1250-NEXT: v_dual_mov_b32 v13, s47 :: v_dual_mov_b32 v14, s48 -; GFX1250-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 -; GFX1250-NEXT: v_mov_b32_e32 v15, s49 +; GFX1250-NEXT: v_dual_mov_b32 v15, s47 :: v_dual_mov_b32 v16, s48 +; GFX1250-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 +; GFX1250-NEXT: v_mov_b32_e32 v17, s49 ; GFX1250-NEXT: s_lshr_b32 s60, s2, 16 ; GFX1250-NEXT: s_lshr_b32 s62, s2, 17 ; GFX1250-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x10000 @@ -8541,91 +8545,91 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX1250-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x10000 ; GFX1250-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x10000 ; GFX1250-NEXT: s_clause 0x3 -; GFX1250-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:240 -; GFX1250-NEXT: global_store_b128 v24, v[4:7], s[0:1] offset:224 -; GFX1250-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:208 -; GFX1250-NEXT: global_store_b128 v24, v[12:15], s[0:1] offset:192 +; GFX1250-NEXT: global_store_b128 v0, v[2:5], s[0:1] offset:240 +; GFX1250-NEXT: global_store_b128 v0, v[6:9], s[0:1] offset:224 +; GFX1250-NEXT: global_store_b128 v0, v[10:13], s[0:1] offset:208 +; GFX1250-NEXT: global_store_b128 v0, v[14:17], s[0:1] offset:192 ; GFX1250-NEXT: s_wait_xcnt 0x3 -; GFX1250-NEXT: v_dual_mov_b32 v0, s26 :: v_dual_mov_b32 v1, s27 -; GFX1250-NEXT: v_dual_mov_b32 v2, s50 :: v_dual_mov_b32 v3, s51 +; GFX1250-NEXT: v_dual_mov_b32 v2, s20 :: v_dual_mov_b32 v3, s21 +; GFX1250-NEXT: v_dual_mov_b32 v4, s50 :: v_dual_mov_b32 v5, s51 ; GFX1250-NEXT: s_wait_xcnt 0x2 -; GFX1250-NEXT: v_mov_b32_e32 v4, s52 -; GFX1250-NEXT: s_lshr_b32 s30, s2, 12 -; GFX1250-NEXT: s_lshr_b32 s28, s2, 13 -; GFX1250-NEXT: s_lshr_b32 s24, s2, 10 -; GFX1250-NEXT: s_lshr_b32 s22, s2, 11 +; GFX1250-NEXT: v_mov_b32_e32 v6, s52 +; GFX1250-NEXT: s_lshr_b32 s44, s2, 12 +; GFX1250-NEXT: s_lshr_b32 s40, s2, 13 +; GFX1250-NEXT: s_lshr_b32 s30, s2, 10 +; GFX1250-NEXT: s_lshr_b32 s26, s2, 11 ; GFX1250-NEXT: s_bfe_i64 s[62:63], s[62:63], 0x10000 ; GFX1250-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x10000 -; GFX1250-NEXT: v_dual_mov_b32 v5, s53 :: v_dual_mov_b32 v6, s54 +; GFX1250-NEXT: v_dual_mov_b32 v7, s53 :: v_dual_mov_b32 v8, s54 ; GFX1250-NEXT: s_wait_xcnt 0x1 -; GFX1250-NEXT: v_dual_mov_b32 v7, s55 :: v_dual_mov_b32 v8, s56 -; GFX1250-NEXT: s_lshr_b32 s20, s2, 8 +; GFX1250-NEXT: v_dual_mov_b32 v9, s55 :: v_dual_mov_b32 v10, s56 +; GFX1250-NEXT: s_lshr_b32 s22, s2, 8 ; GFX1250-NEXT: s_lshr_b32 s18, s2, 9 ; GFX1250-NEXT: s_bfe_i64 s[66:67], s[66:67], 0x10000 ; GFX1250-NEXT: s_bfe_i64 s[64:65], s[64:65], 0x10000 -; GFX1250-NEXT: v_dual_mov_b32 v9, s57 :: v_dual_mov_b32 v10, s58 +; GFX1250-NEXT: v_dual_mov_b32 v11, s57 :: v_dual_mov_b32 v12, s58 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v11, s59 :: v_dual_mov_b32 v12, s60 +; GFX1250-NEXT: v_dual_mov_b32 v13, s59 :: v_dual_mov_b32 v14, s60 ; GFX1250-NEXT: s_lshr_b32 s16, s2, 6 ; GFX1250-NEXT: s_lshr_b32 s14, s2, 7 -; GFX1250-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 -; GFX1250-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 -; GFX1250-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 +; GFX1250-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 ; GFX1250-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 -; GFX1250-NEXT: v_dual_mov_b32 v13, s61 :: v_dual_mov_b32 v14, s62 -; GFX1250-NEXT: v_dual_mov_b32 v15, s63 :: v_dual_mov_b32 v16, s64 +; GFX1250-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 +; GFX1250-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 +; GFX1250-NEXT: v_dual_mov_b32 v15, s61 :: v_dual_mov_b32 v16, s62 +; GFX1250-NEXT: v_dual_mov_b32 v17, s63 :: v_dual_mov_b32 v18, s64 ; GFX1250-NEXT: s_lshr_b32 s12, s2, 4 ; GFX1250-NEXT: s_lshr_b32 s10, s2, 5 ; GFX1250-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 -; GFX1250-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 -; GFX1250-NEXT: v_dual_mov_b32 v17, s65 :: v_dual_mov_b32 v18, s66 -; GFX1250-NEXT: v_dual_mov_b32 v19, s67 :: v_dual_mov_b32 v20, s30 +; GFX1250-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 +; GFX1250-NEXT: v_dual_mov_b32 v19, s65 :: v_dual_mov_b32 v20, s66 +; GFX1250-NEXT: v_dual_mov_b32 v21, s67 :: v_dual_mov_b32 v22, s44 ; GFX1250-NEXT: s_lshr_b32 s8, s2, 2 ; GFX1250-NEXT: s_lshr_b32 s6, s2, 3 ; GFX1250-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 ; GFX1250-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 -; GFX1250-NEXT: v_dual_mov_b32 v21, s31 :: v_dual_mov_b32 v22, s28 -; GFX1250-NEXT: v_mov_b32_e32 v23, s29 +; GFX1250-NEXT: v_dual_mov_b32 v23, s45 :: v_dual_mov_b32 v24, s40 +; GFX1250-NEXT: v_mov_b32_e32 v25, s41 ; GFX1250-NEXT: s_clause 0x5 -; GFX1250-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:176 -; GFX1250-NEXT: global_store_b128 v24, v[4:7], s[0:1] offset:160 -; GFX1250-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:144 -; GFX1250-NEXT: global_store_b128 v24, v[12:15], s[0:1] offset:128 -; GFX1250-NEXT: global_store_b128 v24, v[16:19], s[0:1] offset:112 -; GFX1250-NEXT: global_store_b128 v24, v[20:23], s[0:1] offset:96 +; GFX1250-NEXT: global_store_b128 v0, v[2:5], s[0:1] offset:176 +; GFX1250-NEXT: global_store_b128 v0, v[6:9], s[0:1] offset:160 +; GFX1250-NEXT: global_store_b128 v0, v[10:13], s[0:1] offset:144 +; GFX1250-NEXT: global_store_b128 v0, v[14:17], s[0:1] offset:128 +; GFX1250-NEXT: global_store_b128 v0, v[18:21], s[0:1] offset:112 +; GFX1250-NEXT: global_store_b128 v0, v[22:25], s[0:1] offset:96 ; GFX1250-NEXT: s_wait_xcnt 0x5 -; GFX1250-NEXT: v_dual_mov_b32 v0, s24 :: v_dual_mov_b32 v1, s25 -; GFX1250-NEXT: v_dual_mov_b32 v2, s22 :: v_dual_mov_b32 v3, s23 +; GFX1250-NEXT: v_dual_mov_b32 v2, s30 :: v_dual_mov_b32 v3, s31 +; GFX1250-NEXT: v_dual_mov_b32 v4, s26 :: v_dual_mov_b32 v5, s27 ; GFX1250-NEXT: s_wait_xcnt 0x4 -; GFX1250-NEXT: v_mov_b32_e32 v4, s20 +; GFX1250-NEXT: v_mov_b32_e32 v6, s22 ; GFX1250-NEXT: s_lshr_b32 s68, s2, 1 ; GFX1250-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 ; GFX1250-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 -; GFX1250-NEXT: v_dual_mov_b32 v5, s21 :: v_dual_mov_b32 v6, s18 +; GFX1250-NEXT: v_dual_mov_b32 v7, s23 :: v_dual_mov_b32 v8, s18 ; GFX1250-NEXT: s_wait_xcnt 0x3 -; GFX1250-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v8, s16 +; GFX1250-NEXT: v_dual_mov_b32 v9, s19 :: v_dual_mov_b32 v10, s16 ; GFX1250-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 ; GFX1250-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 -; GFX1250-NEXT: v_dual_mov_b32 v9, s17 :: v_dual_mov_b32 v10, s14 +; GFX1250-NEXT: v_dual_mov_b32 v11, s17 :: v_dual_mov_b32 v12, s14 ; GFX1250-NEXT: s_wait_xcnt 0x2 -; GFX1250-NEXT: v_dual_mov_b32 v11, s15 :: v_dual_mov_b32 v12, s12 +; GFX1250-NEXT: v_dual_mov_b32 v13, s15 :: v_dual_mov_b32 v14, s12 ; GFX1250-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x10000 ; GFX1250-NEXT: s_bfe_i64 s[2:3], s[68:69], 0x10000 -; GFX1250-NEXT: v_dual_mov_b32 v13, s13 :: v_dual_mov_b32 v14, s10 +; GFX1250-NEXT: v_dual_mov_b32 v15, s13 :: v_dual_mov_b32 v16, s10 ; GFX1250-NEXT: s_wait_xcnt 0x1 -; GFX1250-NEXT: v_dual_mov_b32 v15, s11 :: v_dual_mov_b32 v16, s8 -; GFX1250-NEXT: v_dual_mov_b32 v17, s9 :: v_dual_mov_b32 v18, s6 +; GFX1250-NEXT: v_dual_mov_b32 v17, s11 :: v_dual_mov_b32 v18, s8 +; GFX1250-NEXT: v_dual_mov_b32 v19, s9 :: v_dual_mov_b32 v20, s6 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v19, s7 :: v_dual_mov_b32 v20, s4 -; GFX1250-NEXT: v_dual_mov_b32 v21, s5 :: v_dual_mov_b32 v22, s2 -; GFX1250-NEXT: v_mov_b32_e32 v23, s3 +; GFX1250-NEXT: v_dual_mov_b32 v21, s7 :: v_dual_mov_b32 v22, s4 +; GFX1250-NEXT: v_dual_mov_b32 v23, s5 :: v_dual_mov_b32 v24, s2 +; GFX1250-NEXT: v_mov_b32_e32 v25, s3 ; GFX1250-NEXT: s_clause 0x5 -; GFX1250-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:80 -; GFX1250-NEXT: global_store_b128 v24, v[4:7], s[0:1] offset:64 -; GFX1250-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:48 -; GFX1250-NEXT: global_store_b128 v24, v[12:15], s[0:1] offset:32 -; GFX1250-NEXT: global_store_b128 v24, v[16:19], s[0:1] offset:16 -; GFX1250-NEXT: global_store_b128 v24, v[20:23], s[0:1] +; GFX1250-NEXT: global_store_b128 v0, v[2:5], s[0:1] offset:80 +; GFX1250-NEXT: global_store_b128 v0, v[6:9], s[0:1] offset:64 +; GFX1250-NEXT: global_store_b128 v0, v[10:13], s[0:1] offset:48 +; GFX1250-NEXT: global_store_b128 v0, v[14:17], s[0:1] offset:32 +; GFX1250-NEXT: global_store_b128 v0, v[18:21], s[0:1] offset:16 +; GFX1250-NEXT: global_store_b128 v0, v[22:25], s[0:1] ; GFX1250-NEXT: s_endpgm %load = load <32 x i1>, ptr addrspace(4) %in %ext = sext <32 x i1> %load to <32 x i64> @@ -9668,13 +9672,15 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX1250-NEXT: s_lshr_b32 s4, s3, 31 ; GFX1250-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, v1 ; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x1001e -; GFX1250-NEXT: s_bfe_u32 s6, s2, 0x10004 -; GFX1250-NEXT: s_and_b32 s7, s2, 1 +; GFX1250-NEXT: s_bfe_u32 s6, s2, 0x10006 +; GFX1250-NEXT: v_mov_b32_e32 v7, v1 +; GFX1250-NEXT: s_bfe_u32 s7, s2, 0x10004 ; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:416 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4 ; GFX1250-NEXT: s_bfe_u32 s4, s3, 0x1001d ; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x1001c +; GFX1250-NEXT: s_and_b32 s8, s2, 1 ; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:496 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4 @@ -9798,21 +9804,20 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:80 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4 +; GFX1250-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10007 -; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x10006 -; GFX1250-NEXT: v_mov_b32_e32 v7, v1 ; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:64 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4 +; GFX1250-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v2, s4 ; GFX1250-NEXT: s_mov_b32 s4, s3 -; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x10001 +; GFX1250-NEXT: s_bfe_u32 s6, s3, 0x10001 ; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x10005 -; GFX1250-NEXT: v_mov_b32_e32 v6, s5 +; GFX1250-NEXT: v_mov_b32_e32 v6, s6 ; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:48 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v2, s3 +; GFX1250-NEXT: v_dual_mov_b32 v0, s7 :: v_dual_mov_b32 v2, s3 ; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x10003 -; GFX1250-NEXT: s_bfe_u32 s6, s2, 0x10001 +; GFX1250-NEXT: s_bfe_u32 s7, s2, 0x10001 ; GFX1250-NEXT: s_bfe_u32 s2, s2, 0x10002 ; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:32 ; GFX1250-NEXT: s_wait_xcnt 0x0 @@ -9822,7 +9827,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX1250-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 ; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, s7 :: v_dual_mov_b32 v2, s6 +; GFX1250-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v2, s7 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v1, v[4:7], s[0:1] offset:256 ; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] @@ -9841,256 +9846,257 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: ; implicit-def: $sgpr40_sgpr41 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_lshr_b32 s42, s5, 30 -; GFX6-NEXT: s_lshr_b32 s36, s4, 30 -; GFX6-NEXT: s_lshr_b32 s38, s4, 31 -; GFX6-NEXT: s_lshr_b32 s30, s4, 28 -; GFX6-NEXT: s_lshr_b32 s34, s4, 29 -; GFX6-NEXT: s_lshr_b32 s26, s4, 26 -; GFX6-NEXT: s_lshr_b32 s28, s4, 27 -; GFX6-NEXT: s_lshr_b32 s22, s4, 24 -; GFX6-NEXT: s_lshr_b32 s24, s4, 25 -; GFX6-NEXT: s_lshr_b32 s18, s4, 22 -; GFX6-NEXT: s_lshr_b32 s20, s4, 23 -; GFX6-NEXT: s_lshr_b32 s14, s4, 20 -; GFX6-NEXT: s_lshr_b32 s16, s4, 21 -; GFX6-NEXT: s_lshr_b32 s10, s4, 18 -; GFX6-NEXT: s_lshr_b32 s12, s4, 19 +; GFX6-NEXT: s_lshr_b32 s34, s4, 30 +; GFX6-NEXT: s_lshr_b32 s36, s4, 31 +; GFX6-NEXT: s_lshr_b32 s28, s4, 28 +; GFX6-NEXT: s_lshr_b32 s30, s4, 29 +; GFX6-NEXT: s_lshr_b32 s24, s4, 26 +; GFX6-NEXT: s_lshr_b32 s26, s4, 27 +; GFX6-NEXT: s_lshr_b32 s20, s4, 24 +; GFX6-NEXT: s_lshr_b32 s22, s4, 25 +; GFX6-NEXT: s_lshr_b32 s16, s4, 22 +; GFX6-NEXT: s_lshr_b32 s18, s4, 23 +; GFX6-NEXT: s_lshr_b32 s12, s4, 20 +; GFX6-NEXT: s_lshr_b32 s14, s4, 21 +; GFX6-NEXT: s_lshr_b32 s8, s4, 18 +; GFX6-NEXT: s_lshr_b32 s10, s4, 19 ; GFX6-NEXT: s_lshr_b32 s6, s4, 16 -; GFX6-NEXT: s_lshr_b32 s8, s4, 17 ; GFX6-NEXT: s_ashr_i32 s7, s5, 31 ; GFX6-NEXT: s_bfe_i64 s[44:45], s[4:5], 0x10000 ; GFX6-NEXT: v_mov_b32_e32 v4, s7 -; GFX6-NEXT: s_lshr_b32 s40, s4, 14 +; GFX6-NEXT: s_lshr_b32 s38, s4, 17 ; GFX6-NEXT: v_mov_b32_e32 v0, s44 ; GFX6-NEXT: v_mov_b32_e32 v1, s45 -; GFX6-NEXT: s_mov_b32 s44, s5 -; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v6, s44 -; GFX6-NEXT: v_mov_b32_e32 v7, s45 -; GFX6-NEXT: s_lshr_b32 s44, s4, 15 -; GFX6-NEXT: v_mov_b32_e32 v2, s42 -; GFX6-NEXT: v_mov_b32_e32 v3, s43 -; GFX6-NEXT: s_lshr_b32 s42, s4, 12 +; GFX6-NEXT: s_lshr_b32 s44, s4, 14 +; GFX6-NEXT: s_mov_b32 s40, s5 +; GFX6-NEXT: s_bfe_i64 s[46:47], s[40:41], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[40:41], s[42:43], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v6, s46 +; GFX6-NEXT: v_mov_b32_e32 v7, s47 +; GFX6-NEXT: s_lshr_b32 s42, s4, 15 +; GFX6-NEXT: v_mov_b32_e32 v2, s40 +; GFX6-NEXT: v_mov_b32_e32 v3, s41 +; GFX6-NEXT: s_lshr_b32 s40, s4, 12 +; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v8, s36 -; GFX6-NEXT: v_mov_b32_e32 v9, s37 -; GFX6-NEXT: s_lshr_b32 s36, s4, 13 -; GFX6-NEXT: v_mov_b32_e32 v10, s38 -; GFX6-NEXT: v_mov_b32_e32 v11, s39 -; GFX6-NEXT: s_lshr_b32 s38, s4, 10 +; GFX6-NEXT: v_mov_b32_e32 v8, s34 +; GFX6-NEXT: v_mov_b32_e32 v9, s35 +; GFX6-NEXT: s_lshr_b32 s34, s4, 13 +; GFX6-NEXT: v_mov_b32_e32 v10, s36 +; GFX6-NEXT: v_mov_b32_e32 v11, s37 +; GFX6-NEXT: s_lshr_b32 s36, s4, 10 +; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v12, s30 -; GFX6-NEXT: v_mov_b32_e32 v13, s31 -; GFX6-NEXT: s_lshr_b32 s30, s4, 11 -; GFX6-NEXT: v_mov_b32_e32 v14, s34 -; GFX6-NEXT: v_mov_b32_e32 v15, s35 -; GFX6-NEXT: s_lshr_b32 s34, s4, 8 -; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v12, s28 +; GFX6-NEXT: v_mov_b32_e32 v13, s29 +; GFX6-NEXT: s_lshr_b32 s28, s4, 11 +; GFX6-NEXT: v_mov_b32_e32 v14, s30 +; GFX6-NEXT: v_mov_b32_e32 v15, s31 +; GFX6-NEXT: s_lshr_b32 s30, s4, 8 ; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 ; GFX6-NEXT: v_mov_b32_e32 v5, s7 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:496 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s26 -; GFX6-NEXT: v_mov_b32_e32 v3, s27 -; GFX6-NEXT: s_lshr_b32 s26, s4, 9 -; GFX6-NEXT: v_mov_b32_e32 v4, s28 -; GFX6-NEXT: v_mov_b32_e32 v5, s29 -; GFX6-NEXT: s_lshr_b32 s28, s4, 6 -; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v2, s24 +; GFX6-NEXT: v_mov_b32_e32 v3, s25 +; GFX6-NEXT: s_lshr_b32 s24, s4, 9 +; GFX6-NEXT: v_mov_b32_e32 v4, s26 +; GFX6-NEXT: v_mov_b32_e32 v5, s27 +; GFX6-NEXT: s_lshr_b32 s26, s4, 6 ; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:240 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v8, s22 -; GFX6-NEXT: v_mov_b32_e32 v9, s23 -; GFX6-NEXT: s_lshr_b32 s22, s4, 7 -; GFX6-NEXT: v_mov_b32_e32 v10, s24 -; GFX6-NEXT: v_mov_b32_e32 v11, s25 -; GFX6-NEXT: s_lshr_b32 s24, s4, 4 -; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v8, s20 +; GFX6-NEXT: v_mov_b32_e32 v9, s21 +; GFX6-NEXT: s_lshr_b32 s20, s4, 7 +; GFX6-NEXT: v_mov_b32_e32 v10, s22 +; GFX6-NEXT: v_mov_b32_e32 v11, s23 +; GFX6-NEXT: s_lshr_b32 s22, s4, 4 ; GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:224 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v12, s18 -; GFX6-NEXT: v_mov_b32_e32 v13, s19 -; GFX6-NEXT: s_lshr_b32 s18, s4, 5 -; GFX6-NEXT: v_mov_b32_e32 v14, s20 -; GFX6-NEXT: v_mov_b32_e32 v15, s21 -; GFX6-NEXT: s_lshr_b32 s20, s4, 2 -; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v12, s16 +; GFX6-NEXT: v_mov_b32_e32 v13, s17 +; GFX6-NEXT: s_lshr_b32 s16, s4, 5 +; GFX6-NEXT: v_mov_b32_e32 v14, s18 +; GFX6-NEXT: v_mov_b32_e32 v15, s19 +; GFX6-NEXT: s_lshr_b32 s18, s4, 2 ; GFX6-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:208 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s14 -; GFX6-NEXT: v_mov_b32_e32 v3, s15 -; GFX6-NEXT: s_lshr_b32 s14, s4, 3 -; GFX6-NEXT: v_mov_b32_e32 v4, s16 -; GFX6-NEXT: v_mov_b32_e32 v5, s17 -; GFX6-NEXT: s_lshr_b32 s16, s4, 1 -; GFX6-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v2, s12 +; GFX6-NEXT: v_mov_b32_e32 v3, s13 +; GFX6-NEXT: s_lshr_b32 s12, s4, 3 +; GFX6-NEXT: v_mov_b32_e32 v4, s14 +; GFX6-NEXT: v_mov_b32_e32 v5, s15 +; GFX6-NEXT: s_lshr_b32 s14, s4, 1 ; GFX6-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:192 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v8, s10 -; GFX6-NEXT: v_mov_b32_e32 v9, s11 -; GFX6-NEXT: s_lshr_b32 s10, s5, 29 -; GFX6-NEXT: v_mov_b32_e32 v10, s12 -; GFX6-NEXT: v_mov_b32_e32 v11, s13 -; GFX6-NEXT: s_lshr_b32 s12, s5, 28 -; GFX6-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v8, s8 +; GFX6-NEXT: v_mov_b32_e32 v9, s9 +; GFX6-NEXT: s_lshr_b32 s8, s5, 29 +; GFX6-NEXT: v_mov_b32_e32 v10, s10 +; GFX6-NEXT: v_mov_b32_e32 v11, s11 +; GFX6-NEXT: s_lshr_b32 s10, s5, 28 +; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:176 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v12, s6 ; GFX6-NEXT: v_mov_b32_e32 v13, s7 ; GFX6-NEXT: s_lshr_b32 s6, s5, 26 -; GFX6-NEXT: v_mov_b32_e32 v14, s8 -; GFX6-NEXT: v_mov_b32_e32 v15, s9 -; GFX6-NEXT: s_lshr_b32 s8, s5, 27 +; GFX6-NEXT: v_mov_b32_e32 v14, s38 +; GFX6-NEXT: v_mov_b32_e32 v15, s39 +; GFX6-NEXT: s_lshr_b32 s38, s5, 27 +; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:160 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s40 -; GFX6-NEXT: v_mov_b32_e32 v3, s41 -; GFX6-NEXT: s_lshr_b32 s40, s5, 25 -; GFX6-NEXT: v_mov_b32_e32 v4, s44 -; GFX6-NEXT: v_mov_b32_e32 v5, s45 -; GFX6-NEXT: s_lshr_b32 s44, s5, 24 -; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v2, s44 +; GFX6-NEXT: v_mov_b32_e32 v3, s45 +; GFX6-NEXT: s_lshr_b32 s44, s5, 25 +; GFX6-NEXT: v_mov_b32_e32 v4, s42 +; GFX6-NEXT: v_mov_b32_e32 v5, s43 +; GFX6-NEXT: s_lshr_b32 s42, s5, 24 +; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:144 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v8, s42 -; GFX6-NEXT: v_mov_b32_e32 v9, s43 -; GFX6-NEXT: s_lshr_b32 s42, s5, 22 -; GFX6-NEXT: v_mov_b32_e32 v10, s36 -; GFX6-NEXT: v_mov_b32_e32 v11, s37 -; GFX6-NEXT: s_lshr_b32 s36, s5, 23 -; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v8, s40 +; GFX6-NEXT: v_mov_b32_e32 v9, s41 +; GFX6-NEXT: s_lshr_b32 s40, s5, 22 +; GFX6-NEXT: v_mov_b32_e32 v10, s34 +; GFX6-NEXT: v_mov_b32_e32 v11, s35 +; GFX6-NEXT: s_lshr_b32 s34, s5, 23 +; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:128 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v12, s38 -; GFX6-NEXT: v_mov_b32_e32 v13, s39 -; GFX6-NEXT: s_lshr_b32 s38, s5, 20 -; GFX6-NEXT: v_mov_b32_e32 v14, s30 -; GFX6-NEXT: v_mov_b32_e32 v15, s31 +; GFX6-NEXT: v_mov_b32_e32 v12, s36 +; GFX6-NEXT: v_mov_b32_e32 v13, s37 +; GFX6-NEXT: s_lshr_b32 s36, s5, 20 +; GFX6-NEXT: v_mov_b32_e32 v14, s28 +; GFX6-NEXT: v_mov_b32_e32 v15, s29 ; GFX6-NEXT: s_lshr_b32 s4, s5, 21 -; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[30:31], s[34:35], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[28:29], s[30:31], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:112 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s30 -; GFX6-NEXT: v_mov_b32_e32 v3, s31 -; GFX6-NEXT: s_lshr_b32 s30, s5, 18 -; GFX6-NEXT: v_mov_b32_e32 v4, s26 -; GFX6-NEXT: v_mov_b32_e32 v5, s27 -; GFX6-NEXT: s_lshr_b32 s26, s5, 19 -; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v2, s28 +; GFX6-NEXT: v_mov_b32_e32 v3, s29 +; GFX6-NEXT: s_lshr_b32 s28, s5, 18 +; GFX6-NEXT: v_mov_b32_e32 v4, s24 +; GFX6-NEXT: v_mov_b32_e32 v5, s25 +; GFX6-NEXT: s_lshr_b32 s24, s5, 19 +; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:96 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v8, s28 -; GFX6-NEXT: v_mov_b32_e32 v9, s29 -; GFX6-NEXT: s_lshr_b32 s28, s5, 17 -; GFX6-NEXT: v_mov_b32_e32 v10, s22 -; GFX6-NEXT: v_mov_b32_e32 v11, s23 -; GFX6-NEXT: s_lshr_b32 s22, s5, 16 -; GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v8, s26 +; GFX6-NEXT: v_mov_b32_e32 v9, s27 +; GFX6-NEXT: s_lshr_b32 s26, s5, 17 +; GFX6-NEXT: v_mov_b32_e32 v10, s20 +; GFX6-NEXT: v_mov_b32_e32 v11, s21 +; GFX6-NEXT: s_lshr_b32 s20, s5, 16 +; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:80 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v12, s24 -; GFX6-NEXT: v_mov_b32_e32 v13, s25 -; GFX6-NEXT: s_lshr_b32 s24, s5, 14 -; GFX6-NEXT: v_mov_b32_e32 v14, s18 -; GFX6-NEXT: v_mov_b32_e32 v15, s19 -; GFX6-NEXT: s_lshr_b32 s18, s5, 15 -; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v12, s22 +; GFX6-NEXT: v_mov_b32_e32 v13, s23 +; GFX6-NEXT: s_lshr_b32 s22, s5, 14 +; GFX6-NEXT: v_mov_b32_e32 v14, s16 +; GFX6-NEXT: v_mov_b32_e32 v15, s17 +; GFX6-NEXT: s_lshr_b32 s16, s5, 15 +; GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:64 -; GFX6-NEXT: v_mov_b32_e32 v16, s20 -; GFX6-NEXT: v_mov_b32_e32 v17, s21 -; GFX6-NEXT: s_lshr_b32 s20, s5, 12 +; GFX6-NEXT: v_mov_b32_e32 v16, s18 +; GFX6-NEXT: v_mov_b32_e32 v17, s19 +; GFX6-NEXT: s_lshr_b32 s18, s5, 12 +; GFX6-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v18, s14 -; GFX6-NEXT: v_mov_b32_e32 v19, s15 -; GFX6-NEXT: s_lshr_b32 s14, s5, 13 +; GFX6-NEXT: v_mov_b32_e32 v18, s12 +; GFX6-NEXT: v_mov_b32_e32 v19, s13 +; GFX6-NEXT: s_lshr_b32 s12, s5, 13 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s16 -; GFX6-NEXT: v_mov_b32_e32 v3, s17 -; GFX6-NEXT: s_lshr_b32 s16, s5, 10 -; GFX6-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v2, s14 +; GFX6-NEXT: v_mov_b32_e32 v3, s15 +; GFX6-NEXT: s_lshr_b32 s14, s5, 10 ; GFX6-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v8, s12 -; GFX6-NEXT: v_mov_b32_e32 v9, s13 -; GFX6-NEXT: s_lshr_b32 s12, s5, 11 -; GFX6-NEXT: v_mov_b32_e32 v10, s10 -; GFX6-NEXT: v_mov_b32_e32 v11, s11 -; GFX6-NEXT: s_lshr_b32 s10, s5, 8 -; GFX6-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v8, s10 +; GFX6-NEXT: v_mov_b32_e32 v9, s11 +; GFX6-NEXT: s_lshr_b32 s10, s5, 11 +; GFX6-NEXT: v_mov_b32_e32 v10, s8 +; GFX6-NEXT: v_mov_b32_e32 v11, s9 +; GFX6-NEXT: s_lshr_b32 s8, s5, 8 +; GFX6-NEXT: s_bfe_i64 s[30:31], s[38:39], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v12, s6 ; GFX6-NEXT: v_mov_b32_e32 v13, s7 ; GFX6-NEXT: s_lshr_b32 s6, s5, 9 -; GFX6-NEXT: v_mov_b32_e32 v14, s8 -; GFX6-NEXT: v_mov_b32_e32 v15, s9 -; GFX6-NEXT: s_lshr_b32 s8, s5, 6 -; GFX6-NEXT: s_bfe_i64 s[34:35], s[44:45], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v14, s30 +; GFX6-NEXT: v_mov_b32_e32 v15, s31 +; GFX6-NEXT: s_lshr_b32 s30, s5, 6 +; GFX6-NEXT: s_bfe_i64 s[38:39], s[42:43], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[42:43], s[44:45], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v16, s34 -; GFX6-NEXT: v_mov_b32_e32 v17, s35 -; GFX6-NEXT: s_lshr_b32 s34, s5, 7 -; GFX6-NEXT: v_mov_b32_e32 v18, s40 -; GFX6-NEXT: v_mov_b32_e32 v19, s41 -; GFX6-NEXT: s_lshr_b32 s40, s5, 4 -; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v16, s38 +; GFX6-NEXT: v_mov_b32_e32 v17, s39 +; GFX6-NEXT: s_lshr_b32 s38, s5, 7 +; GFX6-NEXT: v_mov_b32_e32 v18, s42 +; GFX6-NEXT: v_mov_b32_e32 v19, s43 +; GFX6-NEXT: s_lshr_b32 s42, s5, 4 +; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s42 -; GFX6-NEXT: v_mov_b32_e32 v1, s43 -; GFX6-NEXT: s_lshr_b32 s42, s5, 5 -; GFX6-NEXT: v_mov_b32_e32 v2, s36 -; GFX6-NEXT: v_mov_b32_e32 v3, s37 -; GFX6-NEXT: s_lshr_b32 s36, s5, 2 -; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v0, s40 +; GFX6-NEXT: v_mov_b32_e32 v1, s41 +; GFX6-NEXT: s_lshr_b32 s40, s5, 5 +; GFX6-NEXT: v_mov_b32_e32 v2, s34 +; GFX6-NEXT: v_mov_b32_e32 v3, s35 +; GFX6-NEXT: s_lshr_b32 s34, s5, 2 +; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:480 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v8, s38 -; GFX6-NEXT: v_mov_b32_e32 v9, s39 -; GFX6-NEXT: s_lshr_b32 s38, s5, 3 +; GFX6-NEXT: v_mov_b32_e32 v8, s36 +; GFX6-NEXT: v_mov_b32_e32 v9, s37 +; GFX6-NEXT: s_lshr_b32 s36, s5, 3 ; GFX6-NEXT: s_lshr_b32 s44, s5, 1 ; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:464 ; GFX6-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:448 @@ -10099,58 +10105,58 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX6-NEXT: v_mov_b32_e32 v11, s5 ; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:416 ; GFX6-NEXT: s_waitcnt expcnt(1) -; GFX6-NEXT: v_mov_b32_e32 v0, s30 -; GFX6-NEXT: v_mov_b32_e32 v1, s31 +; GFX6-NEXT: v_mov_b32_e32 v0, s28 +; GFX6-NEXT: v_mov_b32_e32 v1, s29 +; GFX6-NEXT: v_mov_b32_e32 v2, s24 +; GFX6-NEXT: v_mov_b32_e32 v3, s25 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:400 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s20 +; GFX6-NEXT: v_mov_b32_e32 v1, s21 ; GFX6-NEXT: v_mov_b32_e32 v2, s26 ; GFX6-NEXT: v_mov_b32_e32 v3, s27 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:400 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:384 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s22 ; GFX6-NEXT: v_mov_b32_e32 v1, s23 -; GFX6-NEXT: v_mov_b32_e32 v2, s28 -; GFX6-NEXT: v_mov_b32_e32 v3, s29 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:384 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s24 -; GFX6-NEXT: v_mov_b32_e32 v1, s25 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: v_mov_b32_e32 v3, s19 +; GFX6-NEXT: v_mov_b32_e32 v2, s16 +; GFX6-NEXT: v_mov_b32_e32 v3, s17 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:368 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s20 -; GFX6-NEXT: v_mov_b32_e32 v1, s21 -; GFX6-NEXT: v_mov_b32_e32 v2, s14 -; GFX6-NEXT: v_mov_b32_e32 v3, s15 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:352 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s16 -; GFX6-NEXT: v_mov_b32_e32 v1, s17 +; GFX6-NEXT: v_mov_b32_e32 v0, s18 +; GFX6-NEXT: v_mov_b32_e32 v1, s19 ; GFX6-NEXT: v_mov_b32_e32 v2, s12 ; GFX6-NEXT: v_mov_b32_e32 v3, s13 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:352 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s14 +; GFX6-NEXT: v_mov_b32_e32 v1, s15 +; GFX6-NEXT: v_mov_b32_e32 v2, s10 +; GFX6-NEXT: v_mov_b32_e32 v3, s11 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:336 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s10 -; GFX6-NEXT: v_mov_b32_e32 v1, s11 +; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: v_mov_b32_e32 v1, s9 ; GFX6-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NEXT: v_mov_b32_e32 v3, s7 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:320 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s8 -; GFX6-NEXT: v_mov_b32_e32 v1, s9 -; GFX6-NEXT: v_mov_b32_e32 v2, s34 -; GFX6-NEXT: v_mov_b32_e32 v3, s35 +; GFX6-NEXT: v_mov_b32_e32 v0, s30 +; GFX6-NEXT: v_mov_b32_e32 v1, s31 +; GFX6-NEXT: v_mov_b32_e32 v2, s38 +; GFX6-NEXT: v_mov_b32_e32 v3, s39 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:304 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s40 -; GFX6-NEXT: v_mov_b32_e32 v1, s41 -; GFX6-NEXT: v_mov_b32_e32 v2, s42 -; GFX6-NEXT: v_mov_b32_e32 v3, s43 +; GFX6-NEXT: v_mov_b32_e32 v0, s42 +; GFX6-NEXT: v_mov_b32_e32 v1, s43 +; GFX6-NEXT: v_mov_b32_e32 v2, s40 +; GFX6-NEXT: v_mov_b32_e32 v3, s41 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:288 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s36 -; GFX6-NEXT: v_mov_b32_e32 v1, s37 -; GFX6-NEXT: v_mov_b32_e32 v2, s38 -; GFX6-NEXT: v_mov_b32_e32 v3, s39 +; GFX6-NEXT: v_mov_b32_e32 v0, s34 +; GFX6-NEXT: v_mov_b32_e32 v1, s35 +; GFX6-NEXT: v_mov_b32_e32 v2, s36 +; GFX6-NEXT: v_mov_b32_e32 v3, s37 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:272 ; GFX6-NEXT: v_mov_b32_e32 v8, s44 ; GFX6-NEXT: v_mov_b32_e32 v9, s45 @@ -10161,44 +10167,45 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX8-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane +; GFX8-NEXT: ; implicit-def: $sgpr22_sgpr23 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx2 s[2:3], s[10:11], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_lshr_b32 s0, s3, 8 ; GFX8-NEXT: s_lshr_b32 s48, s3, 15 ; GFX8-NEXT: v_writelane_b32 v62, s0, 0 -; GFX8-NEXT: s_lshr_b32 s74, s3, 30 -; GFX8-NEXT: s_lshr_b32 s30, s3, 31 -; GFX8-NEXT: s_lshr_b32 s72, s3, 28 -; GFX8-NEXT: s_lshr_b32 s34, s3, 29 -; GFX8-NEXT: s_lshr_b32 s70, s3, 26 +; GFX8-NEXT: s_lshr_b32 s64, s3, 30 +; GFX8-NEXT: s_lshr_b32 s24, s3, 31 +; GFX8-NEXT: s_lshr_b32 s68, s3, 28 +; GFX8-NEXT: s_lshr_b32 s30, s3, 29 +; GFX8-NEXT: s_lshr_b32 s72, s3, 26 ; GFX8-NEXT: s_lshr_b32 s36, s3, 27 -; GFX8-NEXT: s_lshr_b32 s68, s3, 24 +; GFX8-NEXT: s_lshr_b32 s74, s3, 24 ; GFX8-NEXT: s_lshr_b32 s38, s3, 25 -; GFX8-NEXT: s_lshr_b32 s64, s3, 22 +; GFX8-NEXT: s_lshr_b32 s70, s3, 22 ; GFX8-NEXT: s_lshr_b32 s40, s3, 23 -; GFX8-NEXT: s_lshr_b32 s60, s3, 20 +; GFX8-NEXT: s_lshr_b32 s66, s3, 20 ; GFX8-NEXT: s_lshr_b32 s42, s3, 21 -; GFX8-NEXT: s_lshr_b32 s66, s3, 18 +; GFX8-NEXT: s_lshr_b32 s62, s3, 18 ; GFX8-NEXT: s_lshr_b32 s44, s3, 19 ; GFX8-NEXT: s_lshr_b32 s56, s3, 16 ; GFX8-NEXT: s_lshr_b32 s46, s3, 17 ; GFX8-NEXT: s_lshr_b32 s58, s3, 14 -; GFX8-NEXT: s_lshr_b32 s62, s3, 12 +; GFX8-NEXT: s_lshr_b32 s60, s3, 12 ; GFX8-NEXT: s_lshr_b32 s54, s3, 10 ; GFX8-NEXT: v_writelane_b32 v62, s1, 1 ; GFX8-NEXT: s_lshr_b32 s0, s3, 9 ; GFX8-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 ; GFX8-NEXT: s_lshr_b32 s52, s3, 11 ; GFX8-NEXT: v_writelane_b32 v62, s0, 2 -; GFX8-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[64:65], s[64:65], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[68:69], s[68:69], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[66:67], s[66:67], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[70:71], s[70:71], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[72:73], s[72:73], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[74:75], s[74:75], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[66:67], s[66:67], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[72:73], s[72:73], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[68:69], s[68:69], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[64:65], s[64:65], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[62:63], s[62:63], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x10000 @@ -10208,8 +10215,8 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 ; GFX8-NEXT: v_mov_b32_e32 v34, s48 ; GFX8-NEXT: s_lshr_b32 s48, s2, 1 ; GFX8-NEXT: s_lshr_b32 s50, s3, 13 @@ -10222,30 +10229,30 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: s_lshr_b32 s18, s3, 3 ; GFX8-NEXT: s_lshr_b32 s20, s3, 1 ; GFX8-NEXT: s_mov_b32 s22, s3 -; GFX8-NEXT: s_lshr_b32 s24, s2, 30 -; GFX8-NEXT: s_lshr_b32 s26, s2, 31 -; GFX8-NEXT: s_lshr_b32 s28, s2, 28 -; GFX8-NEXT: v_mov_b32_e32 v4, s74 -; GFX8-NEXT: v_mov_b32_e32 v12, s72 -; GFX8-NEXT: v_mov_b32_e32 v0, s70 -; GFX8-NEXT: v_mov_b32_e32 v8, s68 -; GFX8-NEXT: v_mov_b32_e32 v16, s64 -; GFX8-NEXT: v_mov_b32_e32 v20, s60 -; GFX8-NEXT: v_mov_b32_e32 v24, s66 +; GFX8-NEXT: s_lshr_b32 s26, s2, 30 +; GFX8-NEXT: s_lshr_b32 s28, s2, 31 +; GFX8-NEXT: s_lshr_b32 s34, s2, 28 +; GFX8-NEXT: v_mov_b32_e32 v4, s64 +; GFX8-NEXT: v_mov_b32_e32 v12, s68 +; GFX8-NEXT: v_mov_b32_e32 v0, s72 +; GFX8-NEXT: v_mov_b32_e32 v8, s74 +; GFX8-NEXT: v_mov_b32_e32 v16, s70 +; GFX8-NEXT: v_mov_b32_e32 v20, s66 +; GFX8-NEXT: v_mov_b32_e32 v24, s62 ; GFX8-NEXT: v_mov_b32_e32 v28, s56 ; GFX8-NEXT: v_mov_b32_e32 v32, s58 -; GFX8-NEXT: v_mov_b32_e32 v36, s62 +; GFX8-NEXT: v_mov_b32_e32 v36, s60 ; GFX8-NEXT: s_lshr_b32 s86, s2, 29 ; GFX8-NEXT: v_mov_b32_e32 v40, s54 ; GFX8-NEXT: s_lshr_b32 s84, s2, 26 ; GFX8-NEXT: s_lshr_b32 s82, s2, 27 ; GFX8-NEXT: s_bfe_i64 vcc, s[52:53], 0x10000 ; GFX8-NEXT: s_lshr_b32 s80, s2, 24 -; GFX8-NEXT: v_mov_b32_e32 v6, s30 -; GFX8-NEXT: v_mov_b32_e32 v7, s31 +; GFX8-NEXT: v_mov_b32_e32 v6, s24 +; GFX8-NEXT: v_mov_b32_e32 v7, s25 ; GFX8-NEXT: s_lshr_b32 s78, s2, 25 ; GFX8-NEXT: s_lshr_b32 s76, s2, 22 -; GFX8-NEXT: v_mov_b32_e32 v14, s34 +; GFX8-NEXT: v_mov_b32_e32 v14, s30 ; GFX8-NEXT: s_lshr_b32 s74, s2, 23 ; GFX8-NEXT: s_lshr_b32 s72, s2, 20 ; GFX8-NEXT: v_mov_b32_e32 v2, s36 @@ -10272,8 +10279,8 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: s_lshr_b32 s40, s2, 4 ; GFX8-NEXT: s_lshr_b32 s38, s2, 5 ; GFX8-NEXT: s_lshr_b32 s36, s2, 2 -; GFX8-NEXT: s_lshr_b32 s34, s2, 3 -; GFX8-NEXT: s_bfe_i64 s[30:31], s[2:3], 0x10000 +; GFX8-NEXT: s_lshr_b32 s30, s2, 3 +; GFX8-NEXT: s_bfe_i64 s[24:25], s[2:3], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[2:3], s[48:49], 0x10000 ; GFX8-NEXT: v_writelane_b32 v62, s2, 4 ; GFX8-NEXT: v_writelane_b32 v62, s3, 5 @@ -10287,26 +10294,26 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: s_bfe_i64 s[6:7], s[2:3], 0x10000 ; GFX8-NEXT: v_readlane_b32 s2, v62, 0 ; GFX8-NEXT: v_readlane_b32 s3, v62, 1 -; GFX8-NEXT: v_mov_b32_e32 v5, s75 -; GFX8-NEXT: v_mov_b32_e32 v13, s73 -; GFX8-NEXT: v_mov_b32_e32 v15, s35 -; GFX8-NEXT: v_mov_b32_e32 v1, s71 +; GFX8-NEXT: v_mov_b32_e32 v5, s65 +; GFX8-NEXT: v_mov_b32_e32 v13, s69 +; GFX8-NEXT: v_mov_b32_e32 v15, s31 +; GFX8-NEXT: v_mov_b32_e32 v1, s73 ; GFX8-NEXT: v_mov_b32_e32 v3, s37 -; GFX8-NEXT: v_mov_b32_e32 v9, s69 +; GFX8-NEXT: v_mov_b32_e32 v9, s75 ; GFX8-NEXT: v_mov_b32_e32 v11, s39 -; GFX8-NEXT: v_mov_b32_e32 v17, s65 +; GFX8-NEXT: v_mov_b32_e32 v17, s71 ; GFX8-NEXT: v_mov_b32_e32 v19, s41 -; GFX8-NEXT: v_mov_b32_e32 v21, s61 +; GFX8-NEXT: v_mov_b32_e32 v21, s67 ; GFX8-NEXT: v_mov_b32_e32 v23, s43 -; GFX8-NEXT: v_mov_b32_e32 v25, s67 +; GFX8-NEXT: v_mov_b32_e32 v25, s63 ; GFX8-NEXT: v_mov_b32_e32 v27, s45 ; GFX8-NEXT: v_mov_b32_e32 v29, s57 ; GFX8-NEXT: v_mov_b32_e32 v31, s47 ; GFX8-NEXT: v_mov_b32_e32 v33, s59 ; GFX8-NEXT: v_mov_b32_e32 v35, s49 -; GFX8-NEXT: v_mov_b32_e32 v37, s63 +; GFX8-NEXT: v_mov_b32_e32 v37, s61 ; GFX8-NEXT: v_mov_b32_e32 v41, s55 -; GFX8-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 @@ -10332,9 +10339,9 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: s_bfe_i64 s[82:83], s[82:83], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[84:85], s[84:85], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[86:87], s[86:87], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 @@ -10450,17 +10457,17 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: s_add_u32 s0, s8, 0xe0 -; GFX8-NEXT: v_mov_b32_e32 v0, s24 -; GFX8-NEXT: v_mov_b32_e32 v1, s25 -; GFX8-NEXT: v_mov_b32_e32 v2, s26 -; GFX8-NEXT: v_mov_b32_e32 v3, s27 +; GFX8-NEXT: v_mov_b32_e32 v0, s26 +; GFX8-NEXT: v_mov_b32_e32 v1, s27 +; GFX8-NEXT: v_mov_b32_e32 v2, s28 +; GFX8-NEXT: v_mov_b32_e32 v3, s29 ; GFX8-NEXT: s_addc_u32 s1, s9, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: s_add_u32 s0, s8, 0xd0 -; GFX8-NEXT: v_mov_b32_e32 v0, s28 -; GFX8-NEXT: v_mov_b32_e32 v1, s29 +; GFX8-NEXT: v_mov_b32_e32 v0, s34 +; GFX8-NEXT: v_mov_b32_e32 v1, s35 ; GFX8-NEXT: v_mov_b32_e32 v2, s86 ; GFX8-NEXT: v_mov_b32_e32 v3, s87 ; GFX8-NEXT: s_addc_u32 s1, s9, 0 @@ -10576,15 +10583,15 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v0, s36 ; GFX8-NEXT: v_mov_b32_e32 v1, s37 -; GFX8-NEXT: v_mov_b32_e32 v2, s34 -; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: v_mov_b32_e32 v2, s30 +; GFX8-NEXT: v_mov_b32_e32 v3, s31 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: v_readlane_b32 s0, v62, 4 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_readlane_b32 s1, v62, 5 ; GFX8-NEXT: v_mov_b32_e32 v4, s8 -; GFX8-NEXT: v_mov_b32_e32 v0, s30 -; GFX8-NEXT: v_mov_b32_e32 v1, s31 +; GFX8-NEXT: v_mov_b32_e32 v0, s24 +; GFX8-NEXT: v_mov_b32_e32 v1, s25 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v5, s9 @@ -10962,74 +10969,75 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX12-LABEL: constant_sextload_v64i1_to_v64i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_load_b64 s[10:11], s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshr_b32 s96, s11, 30 -; GFX12-NEXT: s_lshr_b32 s98, s11, 31 -; GFX12-NEXT: s_lshr_b32 s92, s11, 28 -; GFX12-NEXT: s_lshr_b32 s94, s11, 29 -; GFX12-NEXT: s_lshr_b32 s78, s11, 26 -; GFX12-NEXT: s_lshr_b32 s88, s11, 27 +; GFX12-NEXT: s_lshr_b32 s96, s3, 30 +; GFX12-NEXT: s_lshr_b32 s98, s3, 31 +; GFX12-NEXT: s_lshr_b32 s92, s3, 28 +; GFX12-NEXT: s_lshr_b32 s94, s3, 29 +; GFX12-NEXT: s_lshr_b32 s88, s3, 26 +; GFX12-NEXT: s_lshr_b32 s90, s3, 27 ; GFX12-NEXT: s_bfe_i64 s[96:97], s[96:97], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[100:101], s[98:99], 0x10000 -; GFX12-NEXT: s_lshr_b32 s66, s11, 24 -; GFX12-NEXT: s_lshr_b32 s74, s11, 25 +; GFX12-NEXT: s_lshr_b32 s80, s3, 24 +; GFX12-NEXT: s_lshr_b32 s86, s3, 25 ; GFX12-NEXT: s_bfe_i64 s[92:93], s[92:93], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[94:95], s[94:95], 0x10000 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s96 -; GFX12-NEXT: s_lshr_b32 s56, s11, 22 -; GFX12-NEXT: s_lshr_b32 s62, s11, 23 +; GFX12-NEXT: s_lshr_b32 s70, s3, 22 +; GFX12-NEXT: s_lshr_b32 s76, s3, 23 ; GFX12-NEXT: v_dual_mov_b32 v2, s97 :: v_dual_mov_b32 v3, s100 ; GFX12-NEXT: v_dual_mov_b32 v4, s101 :: v_dual_mov_b32 v5, s92 -; GFX12-NEXT: s_bfe_i64 s[78:79], s[78:79], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[88:89], s[88:89], 0x10000 -; GFX12-NEXT: s_lshr_b32 s44, s11, 20 -; GFX12-NEXT: s_lshr_b32 s52, s11, 21 -; GFX12-NEXT: s_lshr_b32 s30, s11, 18 -; GFX12-NEXT: s_lshr_b32 s40, s11, 19 -; GFX12-NEXT: s_lshr_b32 s18, s11, 16 -; GFX12-NEXT: s_lshr_b32 s26, s11, 17 -; GFX12-NEXT: s_lshr_b32 s2, s11, 14 -; GFX12-NEXT: s_lshr_b32 s4, s11, 15 +; GFX12-NEXT: s_bfe_i64 s[90:91], s[90:91], 0x10000 +; GFX12-NEXT: s_lshr_b32 s60, s3, 20 +; GFX12-NEXT: s_lshr_b32 s66, s3, 21 +; GFX12-NEXT: s_lshr_b32 s50, s3, 18 +; GFX12-NEXT: s_lshr_b32 s56, s3, 19 +; GFX12-NEXT: s_lshr_b32 s40, s3, 16 +; GFX12-NEXT: s_lshr_b32 s48, s3, 17 +; GFX12-NEXT: s_lshr_b32 s6, s3, 14 +; GFX12-NEXT: s_lshr_b32 s8, s3, 15 ; GFX12-NEXT: v_dual_mov_b32 v6, s93 :: v_dual_mov_b32 v7, s94 -; GFX12-NEXT: v_dual_mov_b32 v8, s95 :: v_dual_mov_b32 v9, s78 +; GFX12-NEXT: v_dual_mov_b32 v8, s95 :: v_dual_mov_b32 v9, s88 +; GFX12-NEXT: s_bfe_i64 s[80:81], s[80:81], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[86:87], s[86:87], 0x10000 +; GFX12-NEXT: s_lshr_b32 s10, s3, 12 +; GFX12-NEXT: s_lshr_b32 s12, s3, 13 +; GFX12-NEXT: v_dual_mov_b32 v10, s89 :: v_dual_mov_b32 v11, s90 +; GFX12-NEXT: v_dual_mov_b32 v12, s91 :: v_dual_mov_b32 v13, s80 +; GFX12-NEXT: s_bfe_i64 s[70:71], s[70:71], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[76:77], s[76:77], 0x10000 +; GFX12-NEXT: s_lshr_b32 s14, s3, 10 +; GFX12-NEXT: s_lshr_b32 s16, s3, 11 +; GFX12-NEXT: v_dual_mov_b32 v14, s81 :: v_dual_mov_b32 v15, s86 +; GFX12-NEXT: v_dual_mov_b32 v16, s87 :: v_dual_mov_b32 v17, s70 +; GFX12-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[66:67], s[66:67], 0x10000 -; GFX12-NEXT: s_bfe_i64 s[74:75], s[74:75], 0x10000 -; GFX12-NEXT: s_lshr_b32 s6, s11, 12 -; GFX12-NEXT: s_lshr_b32 s8, s11, 13 -; GFX12-NEXT: v_dual_mov_b32 v10, s79 :: v_dual_mov_b32 v11, s88 -; GFX12-NEXT: v_dual_mov_b32 v12, s89 :: v_dual_mov_b32 v13, s66 +; GFX12-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x10000 -; GFX12-NEXT: s_bfe_i64 s[62:63], s[62:63], 0x10000 -; GFX12-NEXT: s_lshr_b32 s12, s11, 10 -; GFX12-NEXT: s_lshr_b32 s14, s11, 11 -; GFX12-NEXT: v_dual_mov_b32 v14, s67 :: v_dual_mov_b32 v15, s74 -; GFX12-NEXT: v_dual_mov_b32 v16, s75 :: v_dual_mov_b32 v17, s56 -; GFX12-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 -; GFX12-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x10000 -; GFX12-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 -; GFX12-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 -; GFX12-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 -; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 -; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000 -; GFX12-NEXT: s_lshr_b32 s16, s11, 8 -; GFX12-NEXT: s_lshr_b32 s20, s11, 9 -; GFX12-NEXT: v_dual_mov_b32 v18, s57 :: v_dual_mov_b32 v19, s62 -; GFX12-NEXT: v_dual_mov_b32 v20, s63 :: v_dual_mov_b32 v21, s44 +; GFX12-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 -; GFX12-NEXT: s_lshr_b32 s22, s11, 6 -; GFX12-NEXT: s_lshr_b32 s24, s11, 7 -; GFX12-NEXT: v_dual_mov_b32 v22, s45 :: v_dual_mov_b32 v23, s52 -; GFX12-NEXT: v_dual_mov_b32 v24, s53 :: v_dual_mov_b32 v25, s30 -; GFX12-NEXT: v_dual_mov_b32 v26, s31 :: v_dual_mov_b32 v27, s40 -; GFX12-NEXT: v_dual_mov_b32 v28, s41 :: v_dual_mov_b32 v29, s18 -; GFX12-NEXT: v_dual_mov_b32 v30, s19 :: v_dual_mov_b32 v31, s26 -; GFX12-NEXT: v_mov_b32_e32 v32, s27 -; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 +; GFX12-NEXT: s_lshr_b32 s18, s3, 8 +; GFX12-NEXT: s_lshr_b32 s20, s3, 9 +; GFX12-NEXT: v_dual_mov_b32 v18, s71 :: v_dual_mov_b32 v19, s76 +; GFX12-NEXT: v_dual_mov_b32 v20, s77 :: v_dual_mov_b32 v21, s60 ; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 +; GFX12-NEXT: s_lshr_b32 s22, s3, 6 +; GFX12-NEXT: s_lshr_b32 s24, s3, 7 +; GFX12-NEXT: v_dual_mov_b32 v22, s61 :: v_dual_mov_b32 v23, s66 +; GFX12-NEXT: v_dual_mov_b32 v24, s67 :: v_dual_mov_b32 v25, s50 +; GFX12-NEXT: v_dual_mov_b32 v26, s51 :: v_dual_mov_b32 v27, s56 +; GFX12-NEXT: v_dual_mov_b32 v28, s57 :: v_dual_mov_b32 v29, s40 +; GFX12-NEXT: v_dual_mov_b32 v30, s41 :: v_dual_mov_b32 v31, s48 +; GFX12-NEXT: v_mov_b32_e32 v32, s49 +; GFX12-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 ; GFX12-NEXT: s_clause 0x7 ; GFX12-NEXT: global_store_b128 v0, v[1:4], s[0:1] offset:496 ; GFX12-NEXT: global_store_b128 v0, v[5:8], s[0:1] offset:480 @@ -11039,43 +11047,43 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX12-NEXT: global_store_b128 v0, v[21:24], s[0:1] offset:416 ; GFX12-NEXT: global_store_b128 v0, v[25:28], s[0:1] offset:400 ; GFX12-NEXT: global_store_b128 v0, v[29:32], s[0:1] offset:384 -; GFX12-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s3 -; GFX12-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_mov_b32 v4, s5 -; GFX12-NEXT: v_mov_b32_e32 v5, s6 -; GFX12-NEXT: s_lshr_b32 s28, s11, 4 -; GFX12-NEXT: s_lshr_b32 s34, s11, 5 -; GFX12-NEXT: s_lshr_b32 s36, s11, 2 -; GFX12-NEXT: s_lshr_b32 s38, s11, 3 +; GFX12-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_mov_b32 v2, s7 +; GFX12-NEXT: v_dual_mov_b32 v3, s8 :: v_dual_mov_b32 v4, s9 +; GFX12-NEXT: v_mov_b32_e32 v5, s10 +; GFX12-NEXT: s_lshr_b32 s26, s3, 4 +; GFX12-NEXT: s_lshr_b32 s28, s3, 5 +; GFX12-NEXT: s_lshr_b32 s30, s3, 2 +; GFX12-NEXT: s_lshr_b32 s34, s3, 3 ; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 -; GFX12-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 -; GFX12-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8 -; GFX12-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s12 -; GFX12-NEXT: s_lshr_b32 s42, s11, 1 -; GFX12-NEXT: s_mov_b32 s46, s11 +; GFX12-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 +; GFX12-NEXT: v_dual_mov_b32 v6, s11 :: v_dual_mov_b32 v7, s12 +; GFX12-NEXT: v_dual_mov_b32 v8, s13 :: v_dual_mov_b32 v9, s14 +; GFX12-NEXT: s_lshr_b32 s36, s3, 1 +; GFX12-NEXT: s_mov_b32 s4, s3 ; GFX12-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 -; GFX12-NEXT: v_dual_mov_b32 v10, s13 :: v_dual_mov_b32 v11, s14 -; GFX12-NEXT: v_dual_mov_b32 v12, s15 :: v_dual_mov_b32 v13, s16 -; GFX12-NEXT: s_lshr_b32 s48, s10, 30 -; GFX12-NEXT: s_lshr_b32 s50, s10, 31 -; GFX12-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 -; GFX12-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 +; GFX12-NEXT: v_dual_mov_b32 v10, s15 :: v_dual_mov_b32 v11, s16 +; GFX12-NEXT: v_dual_mov_b32 v12, s17 :: v_dual_mov_b32 v13, s18 +; GFX12-NEXT: s_lshr_b32 s38, s2, 30 +; GFX12-NEXT: s_lshr_b32 s42, s2, 31 ; GFX12-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 -; GFX12-NEXT: v_dual_mov_b32 v14, s17 :: v_dual_mov_b32 v15, s20 +; GFX12-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 +; GFX12-NEXT: v_dual_mov_b32 v14, s19 :: v_dual_mov_b32 v15, s20 ; GFX12-NEXT: v_dual_mov_b32 v16, s21 :: v_dual_mov_b32 v17, s22 -; GFX12-NEXT: s_lshr_b32 s54, s10, 28 -; GFX12-NEXT: s_lshr_b32 s58, s10, 29 -; GFX12-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000 -; GFX12-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 +; GFX12-NEXT: s_lshr_b32 s44, s2, 28 +; GFX12-NEXT: s_lshr_b32 s46, s2, 29 +; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 ; GFX12-NEXT: v_dual_mov_b32 v18, s23 :: v_dual_mov_b32 v19, s24 -; GFX12-NEXT: v_dual_mov_b32 v20, s25 :: v_dual_mov_b32 v21, s28 -; GFX12-NEXT: s_lshr_b32 s60, s10, 26 -; GFX12-NEXT: s_lshr_b32 s64, s10, 27 -; GFX12-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000 -; GFX12-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 -; GFX12-NEXT: v_dual_mov_b32 v22, s29 :: v_dual_mov_b32 v23, s34 -; GFX12-NEXT: v_mov_b32_e32 v24, s35 +; GFX12-NEXT: v_dual_mov_b32 v20, s25 :: v_dual_mov_b32 v21, s26 +; GFX12-NEXT: s_lshr_b32 s52, s2, 26 +; GFX12-NEXT: s_lshr_b32 s54, s2, 27 +; GFX12-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 +; GFX12-NEXT: v_dual_mov_b32 v22, s27 :: v_dual_mov_b32 v23, s28 +; GFX12-NEXT: v_mov_b32_e32 v24, s29 ; GFX12-NEXT: s_clause 0x5 ; GFX12-NEXT: global_store_b128 v0, v[1:4], s[0:1] offset:368 ; GFX12-NEXT: global_store_b128 v0, v[5:8], s[0:1] offset:352 @@ -11083,50 +11091,50 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX12-NEXT: global_store_b128 v0, v[13:16], s[0:1] offset:320 ; GFX12-NEXT: global_store_b128 v0, v[17:20], s[0:1] offset:304 ; GFX12-NEXT: global_store_b128 v0, v[21:24], s[0:1] offset:288 -; GFX12-NEXT: v_dual_mov_b32 v1, s36 :: v_dual_mov_b32 v2, s37 -; GFX12-NEXT: v_dual_mov_b32 v3, s38 :: v_dual_mov_b32 v4, s39 -; GFX12-NEXT: v_mov_b32_e32 v5, s46 -; GFX12-NEXT: s_lshr_b32 s68, s10, 24 -; GFX12-NEXT: s_lshr_b32 s70, s10, 25 -; GFX12-NEXT: s_lshr_b32 s72, s10, 22 -; GFX12-NEXT: s_lshr_b32 s76, s10, 23 -; GFX12-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x10000 +; GFX12-NEXT: v_dual_mov_b32 v1, s30 :: v_dual_mov_b32 v2, s31 +; GFX12-NEXT: v_dual_mov_b32 v3, s34 :: v_dual_mov_b32 v4, s35 +; GFX12-NEXT: v_mov_b32_e32 v5, s4 +; GFX12-NEXT: s_lshr_b32 s58, s2, 24 +; GFX12-NEXT: s_lshr_b32 s62, s2, 25 +; GFX12-NEXT: s_lshr_b32 s64, s2, 22 +; GFX12-NEXT: s_lshr_b32 s68, s2, 23 +; GFX12-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 +; GFX12-NEXT: v_dual_mov_b32 v6, s5 :: v_dual_mov_b32 v7, s36 +; GFX12-NEXT: v_dual_mov_b32 v8, s37 :: v_dual_mov_b32 v9, s38 +; GFX12-NEXT: s_lshr_b32 s72, s2, 20 +; GFX12-NEXT: s_lshr_b32 s74, s2, 21 ; GFX12-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x10000 -; GFX12-NEXT: v_dual_mov_b32 v6, s47 :: v_dual_mov_b32 v7, s42 -; GFX12-NEXT: v_dual_mov_b32 v8, s43 :: v_dual_mov_b32 v9, s48 -; GFX12-NEXT: s_lshr_b32 s80, s10, 20 -; GFX12-NEXT: s_lshr_b32 s82, s10, 21 +; GFX12-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x10000 +; GFX12-NEXT: v_dual_mov_b32 v10, s39 :: v_dual_mov_b32 v11, s42 +; GFX12-NEXT: v_dual_mov_b32 v12, s43 :: v_dual_mov_b32 v13, s44 +; GFX12-NEXT: s_lshr_b32 s78, s2, 18 +; GFX12-NEXT: s_lshr_b32 s82, s2, 19 +; GFX12-NEXT: s_bfe_i64 s[68:69], s[68:69], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[64:65], s[64:65], 0x10000 -; GFX12-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x10000 -; GFX12-NEXT: v_dual_mov_b32 v10, s49 :: v_dual_mov_b32 v11, s50 -; GFX12-NEXT: v_dual_mov_b32 v12, s51 :: v_dual_mov_b32 v13, s54 -; GFX12-NEXT: s_lshr_b32 s84, s10, 18 -; GFX12-NEXT: s_lshr_b32 s86, s10, 19 -; GFX12-NEXT: s_bfe_i64 s[76:77], s[76:77], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[62:63], s[62:63], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x10000 +; GFX12-NEXT: v_dual_mov_b32 v14, s45 :: v_dual_mov_b32 v15, s46 +; GFX12-NEXT: v_dual_mov_b32 v16, s47 :: v_dual_mov_b32 v17, s52 +; GFX12-NEXT: s_lshr_b32 s84, s2, 16 +; GFX12-NEXT: s_lshr_b32 s98, s2, 17 +; GFX12-NEXT: s_bfe_i64 s[74:75], s[74:75], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[72:73], s[72:73], 0x10000 -; GFX12-NEXT: s_bfe_i64 s[70:71], s[70:71], 0x10000 -; GFX12-NEXT: s_bfe_i64 s[68:69], s[68:69], 0x10000 -; GFX12-NEXT: v_dual_mov_b32 v14, s55 :: v_dual_mov_b32 v15, s58 -; GFX12-NEXT: v_dual_mov_b32 v16, s59 :: v_dual_mov_b32 v17, s60 -; GFX12-NEXT: s_lshr_b32 s90, s10, 16 -; GFX12-NEXT: s_lshr_b32 s98, s10, 17 +; GFX12-NEXT: v_dual_mov_b32 v18, s53 :: v_dual_mov_b32 v19, s54 +; GFX12-NEXT: v_dual_mov_b32 v20, s55 :: v_dual_mov_b32 v21, s58 +; GFX12-NEXT: s_lshr_b32 s96, s2, 14 +; GFX12-NEXT: s_lshr_b32 s100, s2, 15 +; GFX12-NEXT: s_lshr_b32 s94, s2, 13 +; GFX12-NEXT: s_lshr_b32 s90, s2, 11 +; GFX12-NEXT: s_lshr_b32 s86, s2, 9 +; GFX12-NEXT: s_lshr_b32 s76, s2, 7 +; GFX12-NEXT: s_lshr_b32 s66, s2, 5 +; GFX12-NEXT: s_lshr_b32 s56, s2, 3 +; GFX12-NEXT: s_lshr_b32 s48, s2, 1 ; GFX12-NEXT: s_bfe_i64 s[82:83], s[82:83], 0x10000 -; GFX12-NEXT: s_bfe_i64 s[80:81], s[80:81], 0x10000 -; GFX12-NEXT: v_dual_mov_b32 v18, s61 :: v_dual_mov_b32 v19, s64 -; GFX12-NEXT: v_dual_mov_b32 v20, s65 :: v_dual_mov_b32 v21, s68 -; GFX12-NEXT: s_lshr_b32 s96, s10, 14 -; GFX12-NEXT: s_lshr_b32 s100, s10, 15 -; GFX12-NEXT: s_lshr_b32 s94, s10, 13 -; GFX12-NEXT: s_lshr_b32 s88, s10, 11 -; GFX12-NEXT: s_lshr_b32 s74, s10, 9 -; GFX12-NEXT: s_lshr_b32 s62, s10, 7 -; GFX12-NEXT: s_lshr_b32 s52, s10, 5 -; GFX12-NEXT: s_lshr_b32 s40, s10, 3 -; GFX12-NEXT: s_lshr_b32 s26, s10, 1 -; GFX12-NEXT: s_bfe_i64 s[86:87], s[86:87], 0x10000 -; GFX12-NEXT: s_bfe_i64 s[84:85], s[84:85], 0x10000 -; GFX12-NEXT: v_dual_mov_b32 v22, s69 :: v_dual_mov_b32 v23, s70 -; GFX12-NEXT: v_mov_b32_e32 v24, s71 +; GFX12-NEXT: s_bfe_i64 s[78:79], s[78:79], 0x10000 +; GFX12-NEXT: v_dual_mov_b32 v22, s59 :: v_dual_mov_b32 v23, s62 +; GFX12-NEXT: v_mov_b32_e32 v24, s63 ; GFX12-NEXT: s_clause 0x5 ; GFX12-NEXT: global_store_b128 v0, v[1:4], s[0:1] offset:272 ; GFX12-NEXT: global_store_b128 v0, v[5:8], s[0:1] offset:256 @@ -11134,43 +11142,43 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX12-NEXT: global_store_b128 v0, v[13:16], s[0:1] offset:224 ; GFX12-NEXT: global_store_b128 v0, v[17:20], s[0:1] offset:208 ; GFX12-NEXT: global_store_b128 v0, v[21:24], s[0:1] offset:192 -; GFX12-NEXT: v_dual_mov_b32 v1, s72 :: v_dual_mov_b32 v2, s73 -; GFX12-NEXT: v_dual_mov_b32 v3, s76 :: v_dual_mov_b32 v4, s77 -; GFX12-NEXT: v_mov_b32_e32 v5, s80 -; GFX12-NEXT: s_lshr_b32 s92, s10, 12 -; GFX12-NEXT: s_lshr_b32 s78, s10, 10 +; GFX12-NEXT: v_dual_mov_b32 v1, s64 :: v_dual_mov_b32 v2, s65 +; GFX12-NEXT: v_dual_mov_b32 v3, s68 :: v_dual_mov_b32 v4, s69 +; GFX12-NEXT: v_mov_b32_e32 v5, s72 +; GFX12-NEXT: s_lshr_b32 s92, s2, 12 +; GFX12-NEXT: s_lshr_b32 s88, s2, 10 ; GFX12-NEXT: s_bfe_i64 s[98:99], s[98:99], 0x10000 -; GFX12-NEXT: s_bfe_i64 s[90:91], s[90:91], 0x10000 -; GFX12-NEXT: v_dual_mov_b32 v6, s81 :: v_dual_mov_b32 v7, s82 -; GFX12-NEXT: v_dual_mov_b32 v8, s83 :: v_dual_mov_b32 v9, s84 -; GFX12-NEXT: s_lshr_b32 s66, s10, 8 -; GFX12-NEXT: s_lshr_b32 s56, s10, 6 -; GFX12-NEXT: s_lshr_b32 s44, s10, 4 -; GFX12-NEXT: s_lshr_b32 s30, s10, 2 -; GFX12-NEXT: s_bfe_i64 s[18:19], s[10:11], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[84:85], s[84:85], 0x10000 +; GFX12-NEXT: v_dual_mov_b32 v6, s73 :: v_dual_mov_b32 v7, s74 +; GFX12-NEXT: v_dual_mov_b32 v8, s75 :: v_dual_mov_b32 v9, s78 +; GFX12-NEXT: s_lshr_b32 s80, s2, 8 +; GFX12-NEXT: s_lshr_b32 s70, s2, 6 +; GFX12-NEXT: s_lshr_b32 s60, s2, 4 +; GFX12-NEXT: s_lshr_b32 s50, s2, 2 +; GFX12-NEXT: s_bfe_i64 s[40:41], s[2:3], 0x10000 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_bfe_i64 s[10:11], s[26:27], 0x10000 -; GFX12-NEXT: s_bfe_i64 s[26:27], s[40:41], 0x10000 -; GFX12-NEXT: s_bfe_i64 s[40:41], s[52:53], 0x10000 -; GFX12-NEXT: s_bfe_i64 s[52:53], s[62:63], 0x10000 -; GFX12-NEXT: s_bfe_i64 s[62:63], s[74:75], 0x10000 -; GFX12-NEXT: s_bfe_i64 s[74:75], s[88:89], 0x10000 -; GFX12-NEXT: s_bfe_i64 s[88:89], s[94:95], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[2:3], s[48:49], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[48:49], s[56:57], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[56:57], s[66:67], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[66:67], s[76:77], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[76:77], s[86:87], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[86:87], s[90:91], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[90:91], s[94:95], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[94:95], s[100:101], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[96:97], s[96:97], 0x10000 -; GFX12-NEXT: v_dual_mov_b32 v10, s85 :: v_dual_mov_b32 v11, s86 -; GFX12-NEXT: v_dual_mov_b32 v12, s87 :: v_dual_mov_b32 v13, s90 -; GFX12-NEXT: s_bfe_i64 s[78:79], s[78:79], 0x10000 +; GFX12-NEXT: v_dual_mov_b32 v10, s79 :: v_dual_mov_b32 v11, s82 +; GFX12-NEXT: v_dual_mov_b32 v12, s83 :: v_dual_mov_b32 v13, s84 +; GFX12-NEXT: s_bfe_i64 s[88:89], s[88:89], 0x10000 ; GFX12-NEXT: s_bfe_i64 s[92:93], s[92:93], 0x10000 -; GFX12-NEXT: v_dual_mov_b32 v14, s91 :: v_dual_mov_b32 v15, s98 +; GFX12-NEXT: v_dual_mov_b32 v14, s85 :: v_dual_mov_b32 v15, s98 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v16, s99 :: v_dual_mov_b32 v17, s96 -; GFX12-NEXT: s_bfe_i64 s[66:67], s[66:67], 0x10000 +; GFX12-NEXT: s_bfe_i64 s[80:81], s[80:81], 0x10000 ; GFX12-NEXT: v_dual_mov_b32 v18, s97 :: v_dual_mov_b32 v19, s94 ; GFX12-NEXT: v_dual_mov_b32 v20, s95 :: v_dual_mov_b32 v21, s92 -; GFX12-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x10000 -; GFX12-NEXT: v_dual_mov_b32 v22, s93 :: v_dual_mov_b32 v23, s88 -; GFX12-NEXT: v_mov_b32_e32 v24, s89 +; GFX12-NEXT: s_bfe_i64 s[70:71], s[70:71], 0x10000 +; GFX12-NEXT: v_dual_mov_b32 v22, s93 :: v_dual_mov_b32 v23, s90 +; GFX12-NEXT: v_mov_b32_e32 v24, s91 ; GFX12-NEXT: s_clause 0x5 ; GFX12-NEXT: global_store_b128 v0, v[1:4], s[0:1] offset:176 ; GFX12-NEXT: global_store_b128 v0, v[5:8], s[0:1] offset:160 @@ -11178,23 +11186,23 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX12-NEXT: global_store_b128 v0, v[13:16], s[0:1] offset:128 ; GFX12-NEXT: global_store_b128 v0, v[17:20], s[0:1] offset:112 ; GFX12-NEXT: global_store_b128 v0, v[21:24], s[0:1] offset:96 -; GFX12-NEXT: v_dual_mov_b32 v1, s78 :: v_dual_mov_b32 v2, s79 -; GFX12-NEXT: v_dual_mov_b32 v3, s74 :: v_dual_mov_b32 v4, s75 +; GFX12-NEXT: v_dual_mov_b32 v1, s88 :: v_dual_mov_b32 v2, s89 +; GFX12-NEXT: v_dual_mov_b32 v3, s86 :: v_dual_mov_b32 v4, s87 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v5, s66 -; GFX12-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 -; GFX12-NEXT: v_dual_mov_b32 v6, s67 :: v_dual_mov_b32 v7, s62 -; GFX12-NEXT: v_dual_mov_b32 v8, s63 :: v_dual_mov_b32 v9, s56 -; GFX12-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 -; GFX12-NEXT: v_dual_mov_b32 v10, s57 :: v_dual_mov_b32 v11, s52 +; GFX12-NEXT: v_mov_b32_e32 v5, s80 +; GFX12-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x10000 +; GFX12-NEXT: v_dual_mov_b32 v6, s81 :: v_dual_mov_b32 v7, s76 +; GFX12-NEXT: v_dual_mov_b32 v8, s77 :: v_dual_mov_b32 v9, s70 +; GFX12-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000 +; GFX12-NEXT: v_dual_mov_b32 v10, s71 :: v_dual_mov_b32 v11, s66 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v12, s53 :: v_dual_mov_b32 v13, s44 -; GFX12-NEXT: v_dual_mov_b32 v14, s45 :: v_dual_mov_b32 v15, s40 -; GFX12-NEXT: v_dual_mov_b32 v16, s41 :: v_dual_mov_b32 v17, s30 -; GFX12-NEXT: v_dual_mov_b32 v18, s31 :: v_dual_mov_b32 v19, s26 -; GFX12-NEXT: v_dual_mov_b32 v20, s27 :: v_dual_mov_b32 v21, s18 -; GFX12-NEXT: v_dual_mov_b32 v22, s19 :: v_dual_mov_b32 v23, s10 -; GFX12-NEXT: v_mov_b32_e32 v24, s11 +; GFX12-NEXT: v_dual_mov_b32 v12, s67 :: v_dual_mov_b32 v13, s60 +; GFX12-NEXT: v_dual_mov_b32 v14, s61 :: v_dual_mov_b32 v15, s56 +; GFX12-NEXT: v_dual_mov_b32 v16, s57 :: v_dual_mov_b32 v17, s50 +; GFX12-NEXT: v_dual_mov_b32 v18, s51 :: v_dual_mov_b32 v19, s48 +; GFX12-NEXT: v_dual_mov_b32 v20, s49 :: v_dual_mov_b32 v21, s40 +; GFX12-NEXT: v_dual_mov_b32 v22, s41 :: v_dual_mov_b32 v23, s2 +; GFX12-NEXT: v_mov_b32_e32 v24, s3 ; GFX12-NEXT: s_clause 0x5 ; GFX12-NEXT: global_store_b128 v0, v[1:4], s[0:1] offset:80 ; GFX12-NEXT: global_store_b128 v0, v[5:8], s[0:1] offset:64 @@ -11207,74 +11215,75 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX1250-LABEL: constant_sextload_v64i1_to_v64i64: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_load_b64 s[10:11], s[2:3], 0x0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_lshr_b32 s96, s11, 30 -; GFX1250-NEXT: s_lshr_b32 s98, s11, 31 -; GFX1250-NEXT: s_lshr_b32 s92, s11, 28 -; GFX1250-NEXT: s_lshr_b32 s94, s11, 29 -; GFX1250-NEXT: s_lshr_b32 s78, s11, 26 -; GFX1250-NEXT: s_lshr_b32 s88, s11, 27 +; GFX1250-NEXT: s_lshr_b32 s96, s3, 30 +; GFX1250-NEXT: s_lshr_b32 s98, s3, 31 +; GFX1250-NEXT: s_lshr_b32 s92, s3, 28 +; GFX1250-NEXT: s_lshr_b32 s94, s3, 29 +; GFX1250-NEXT: s_lshr_b32 s88, s3, 26 +; GFX1250-NEXT: s_lshr_b32 s90, s3, 27 ; GFX1250-NEXT: s_bfe_i64 s[96:97], s[96:97], 0x10000 ; GFX1250-NEXT: s_bfe_i64 s[100:101], s[98:99], 0x10000 -; GFX1250-NEXT: s_lshr_b32 s66, s11, 24 -; GFX1250-NEXT: s_lshr_b32 s74, s11, 25 +; GFX1250-NEXT: s_lshr_b32 s80, s3, 24 +; GFX1250-NEXT: s_lshr_b32 s86, s3, 25 ; GFX1250-NEXT: s_bfe_i64 s[92:93], s[92:93], 0x10000 ; GFX1250-NEXT: s_bfe_i64 s[94:95], s[94:95], 0x10000 ; GFX1250-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v0, s96 -; GFX1250-NEXT: s_lshr_b32 s56, s11, 22 -; GFX1250-NEXT: s_lshr_b32 s62, s11, 23 +; GFX1250-NEXT: s_lshr_b32 s70, s3, 22 +; GFX1250-NEXT: s_lshr_b32 s76, s3, 23 ; GFX1250-NEXT: v_dual_mov_b32 v1, s97 :: v_dual_mov_b32 v2, s100 ; GFX1250-NEXT: v_dual_mov_b32 v3, s101 :: v_dual_mov_b32 v4, s92 -; GFX1250-NEXT: s_bfe_i64 s[78:79], s[78:79], 0x10000 ; GFX1250-NEXT: s_bfe_i64 s[88:89], s[88:89], 0x10000 -; GFX1250-NEXT: s_lshr_b32 s44, s11, 20 -; GFX1250-NEXT: s_lshr_b32 s52, s11, 21 -; GFX1250-NEXT: s_lshr_b32 s30, s11, 18 -; GFX1250-NEXT: s_lshr_b32 s40, s11, 19 -; GFX1250-NEXT: s_lshr_b32 s18, s11, 16 -; GFX1250-NEXT: s_lshr_b32 s26, s11, 17 -; GFX1250-NEXT: s_lshr_b32 s2, s11, 14 -; GFX1250-NEXT: s_lshr_b32 s4, s11, 15 +; GFX1250-NEXT: s_bfe_i64 s[90:91], s[90:91], 0x10000 +; GFX1250-NEXT: s_lshr_b32 s60, s3, 20 +; GFX1250-NEXT: s_lshr_b32 s66, s3, 21 +; GFX1250-NEXT: s_lshr_b32 s50, s3, 18 +; GFX1250-NEXT: s_lshr_b32 s56, s3, 19 +; GFX1250-NEXT: s_lshr_b32 s40, s3, 16 +; GFX1250-NEXT: s_lshr_b32 s48, s3, 17 +; GFX1250-NEXT: s_lshr_b32 s6, s3, 14 +; GFX1250-NEXT: s_lshr_b32 s8, s3, 15 ; GFX1250-NEXT: v_dual_mov_b32 v5, s93 :: v_dual_mov_b32 v6, s94 -; GFX1250-NEXT: v_dual_mov_b32 v7, s95 :: v_dual_mov_b32 v10, s78 +; GFX1250-NEXT: v_dual_mov_b32 v7, s95 :: v_dual_mov_b32 v10, s88 +; GFX1250-NEXT: s_bfe_i64 s[80:81], s[80:81], 0x10000 +; GFX1250-NEXT: s_bfe_i64 s[86:87], s[86:87], 0x10000 +; GFX1250-NEXT: s_lshr_b32 s10, s3, 12 +; GFX1250-NEXT: s_lshr_b32 s12, s3, 13 +; GFX1250-NEXT: v_dual_mov_b32 v11, s89 :: v_dual_mov_b32 v12, s90 +; GFX1250-NEXT: v_dual_mov_b32 v13, s91 :: v_dual_mov_b32 v14, s80 +; GFX1250-NEXT: s_bfe_i64 s[70:71], s[70:71], 0x10000 +; GFX1250-NEXT: s_bfe_i64 s[76:77], s[76:77], 0x10000 +; GFX1250-NEXT: s_lshr_b32 s14, s3, 10 +; GFX1250-NEXT: s_lshr_b32 s16, s3, 11 +; GFX1250-NEXT: v_dual_mov_b32 v15, s81 :: v_dual_mov_b32 v16, s86 +; GFX1250-NEXT: v_dual_mov_b32 v17, s87 :: v_dual_mov_b32 v18, s70 +; GFX1250-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x10000 ; GFX1250-NEXT: s_bfe_i64 s[66:67], s[66:67], 0x10000 -; GFX1250-NEXT: s_bfe_i64 s[74:75], s[74:75], 0x10000 -; GFX1250-NEXT: s_lshr_b32 s6, s11, 12 -; GFX1250-NEXT: s_lshr_b32 s8, s11, 13 -; GFX1250-NEXT: v_dual_mov_b32 v11, s79 :: v_dual_mov_b32 v12, s88 -; GFX1250-NEXT: v_dual_mov_b32 v13, s89 :: v_dual_mov_b32 v14, s66 +; GFX1250-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000 ; GFX1250-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x10000 -; GFX1250-NEXT: s_bfe_i64 s[62:63], s[62:63], 0x10000 -; GFX1250-NEXT: s_lshr_b32 s12, s11, 10 -; GFX1250-NEXT: s_lshr_b32 s14, s11, 11 -; GFX1250-NEXT: v_dual_mov_b32 v15, s67 :: v_dual_mov_b32 v16, s74 -; GFX1250-NEXT: v_dual_mov_b32 v17, s75 :: v_dual_mov_b32 v18, s56 -; GFX1250-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 -; GFX1250-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x10000 -; GFX1250-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 ; GFX1250-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 -; GFX1250-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 -; GFX1250-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 -; GFX1250-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 -; GFX1250-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000 -; GFX1250-NEXT: s_lshr_b32 s16, s11, 8 -; GFX1250-NEXT: s_lshr_b32 s20, s11, 9 -; GFX1250-NEXT: v_dual_mov_b32 v19, s57 :: v_dual_mov_b32 v20, s62 -; GFX1250-NEXT: v_dual_mov_b32 v21, s63 :: v_dual_mov_b32 v22, s44 +; GFX1250-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 ; GFX1250-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 ; GFX1250-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 -; GFX1250-NEXT: s_lshr_b32 s22, s11, 6 -; GFX1250-NEXT: s_lshr_b32 s24, s11, 7 -; GFX1250-NEXT: v_dual_mov_b32 v23, s45 :: v_dual_mov_b32 v24, s52 -; GFX1250-NEXT: v_dual_mov_b32 v25, s53 :: v_dual_mov_b32 v26, s30 -; GFX1250-NEXT: v_dual_mov_b32 v27, s31 :: v_dual_mov_b32 v28, s40 -; GFX1250-NEXT: v_dual_mov_b32 v29, s41 :: v_dual_mov_b32 v30, s18 -; GFX1250-NEXT: v_dual_mov_b32 v31, s19 :: v_dual_mov_b32 v32, s26 -; GFX1250-NEXT: v_mov_b32_e32 v33, s27 -; GFX1250-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 +; GFX1250-NEXT: s_lshr_b32 s18, s3, 8 +; GFX1250-NEXT: s_lshr_b32 s20, s3, 9 +; GFX1250-NEXT: v_dual_mov_b32 v19, s71 :: v_dual_mov_b32 v20, s76 +; GFX1250-NEXT: v_dual_mov_b32 v21, s77 :: v_dual_mov_b32 v22, s60 ; GFX1250-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 +; GFX1250-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 +; GFX1250-NEXT: s_lshr_b32 s22, s3, 6 +; GFX1250-NEXT: s_lshr_b32 s24, s3, 7 +; GFX1250-NEXT: v_dual_mov_b32 v23, s61 :: v_dual_mov_b32 v24, s66 +; GFX1250-NEXT: v_dual_mov_b32 v25, s67 :: v_dual_mov_b32 v26, s50 +; GFX1250-NEXT: v_dual_mov_b32 v27, s51 :: v_dual_mov_b32 v28, s56 +; GFX1250-NEXT: v_dual_mov_b32 v29, s57 :: v_dual_mov_b32 v30, s40 +; GFX1250-NEXT: v_dual_mov_b32 v31, s41 :: v_dual_mov_b32 v32, s48 +; GFX1250-NEXT: v_mov_b32_e32 v33, s49 +; GFX1250-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 +; GFX1250-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 ; GFX1250-NEXT: s_clause 0x7 ; GFX1250-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:496 ; GFX1250-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:480 @@ -11285,48 +11294,48 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX1250-NEXT: global_store_b128 v8, v[26:29], s[0:1] offset:400 ; GFX1250-NEXT: global_store_b128 v8, v[30:33], s[0:1] offset:384 ; GFX1250-NEXT: s_wait_xcnt 0x7 -; GFX1250-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX1250-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX1250-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX1250-NEXT: v_dual_mov_b32 v2, s8 :: v_dual_mov_b32 v3, s9 ; GFX1250-NEXT: s_wait_xcnt 0x6 -; GFX1250-NEXT: v_mov_b32_e32 v4, s6 -; GFX1250-NEXT: s_lshr_b32 s28, s11, 4 -; GFX1250-NEXT: s_lshr_b32 s34, s11, 5 -; GFX1250-NEXT: s_lshr_b32 s36, s11, 2 -; GFX1250-NEXT: s_lshr_b32 s38, s11, 3 +; GFX1250-NEXT: v_mov_b32_e32 v4, s10 +; GFX1250-NEXT: s_lshr_b32 s26, s3, 4 +; GFX1250-NEXT: s_lshr_b32 s28, s3, 5 +; GFX1250-NEXT: s_lshr_b32 s30, s3, 2 +; GFX1250-NEXT: s_lshr_b32 s34, s3, 3 ; GFX1250-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 -; GFX1250-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 -; GFX1250-NEXT: v_dual_mov_b32 v5, s7 :: v_dual_mov_b32 v6, s8 +; GFX1250-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 +; GFX1250-NEXT: v_dual_mov_b32 v5, s11 :: v_dual_mov_b32 v6, s12 ; GFX1250-NEXT: s_wait_xcnt 0x5 -; GFX1250-NEXT: v_dual_mov_b32 v7, s9 :: v_dual_mov_b32 v10, s12 -; GFX1250-NEXT: s_lshr_b32 s42, s11, 1 -; GFX1250-NEXT: s_mov_b32 s46, s11 +; GFX1250-NEXT: v_dual_mov_b32 v7, s13 :: v_dual_mov_b32 v10, s14 +; GFX1250-NEXT: s_lshr_b32 s36, s3, 1 +; GFX1250-NEXT: s_mov_b32 s4, s3 ; GFX1250-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 ; GFX1250-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 -; GFX1250-NEXT: v_dual_mov_b32 v11, s13 :: v_dual_mov_b32 v12, s14 +; GFX1250-NEXT: v_dual_mov_b32 v11, s15 :: v_dual_mov_b32 v12, s16 ; GFX1250-NEXT: s_wait_xcnt 0x4 -; GFX1250-NEXT: v_dual_mov_b32 v13, s15 :: v_dual_mov_b32 v14, s16 -; GFX1250-NEXT: s_lshr_b32 s48, s10, 30 -; GFX1250-NEXT: s_lshr_b32 s50, s10, 31 -; GFX1250-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 -; GFX1250-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 +; GFX1250-NEXT: v_dual_mov_b32 v13, s17 :: v_dual_mov_b32 v14, s18 +; GFX1250-NEXT: s_lshr_b32 s38, s2, 30 +; GFX1250-NEXT: s_lshr_b32 s42, s2, 31 ; GFX1250-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 +; GFX1250-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 ; GFX1250-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 -; GFX1250-NEXT: v_dual_mov_b32 v15, s17 :: v_dual_mov_b32 v16, s20 +; GFX1250-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 +; GFX1250-NEXT: v_dual_mov_b32 v15, s19 :: v_dual_mov_b32 v16, s20 ; GFX1250-NEXT: s_wait_xcnt 0x3 ; GFX1250-NEXT: v_dual_mov_b32 v17, s21 :: v_dual_mov_b32 v18, s22 -; GFX1250-NEXT: s_lshr_b32 s54, s10, 28 -; GFX1250-NEXT: s_lshr_b32 s58, s10, 29 -; GFX1250-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000 -; GFX1250-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 +; GFX1250-NEXT: s_lshr_b32 s44, s2, 28 +; GFX1250-NEXT: s_lshr_b32 s46, s2, 29 +; GFX1250-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 +; GFX1250-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 ; GFX1250-NEXT: v_dual_mov_b32 v19, s23 :: v_dual_mov_b32 v20, s24 ; GFX1250-NEXT: s_wait_xcnt 0x2 -; GFX1250-NEXT: v_dual_mov_b32 v21, s25 :: v_dual_mov_b32 v22, s28 -; GFX1250-NEXT: s_lshr_b32 s60, s10, 26 -; GFX1250-NEXT: s_lshr_b32 s64, s10, 27 -; GFX1250-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000 -; GFX1250-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 -; GFX1250-NEXT: v_dual_mov_b32 v23, s29 :: v_dual_mov_b32 v24, s34 -; GFX1250-NEXT: v_mov_b32_e32 v25, s35 +; GFX1250-NEXT: v_dual_mov_b32 v21, s25 :: v_dual_mov_b32 v22, s26 +; GFX1250-NEXT: s_lshr_b32 s52, s2, 26 +; GFX1250-NEXT: s_lshr_b32 s54, s2, 27 +; GFX1250-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 +; GFX1250-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 +; GFX1250-NEXT: v_dual_mov_b32 v23, s27 :: v_dual_mov_b32 v24, s28 +; GFX1250-NEXT: v_mov_b32_e32 v25, s29 ; GFX1250-NEXT: s_clause 0x5 ; GFX1250-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:368 ; GFX1250-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:352 @@ -11335,55 +11344,55 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX1250-NEXT: global_store_b128 v8, v[18:21], s[0:1] offset:304 ; GFX1250-NEXT: global_store_b128 v8, v[22:25], s[0:1] offset:288 ; GFX1250-NEXT: s_wait_xcnt 0x5 -; GFX1250-NEXT: v_dual_mov_b32 v0, s36 :: v_dual_mov_b32 v1, s37 -; GFX1250-NEXT: v_dual_mov_b32 v2, s38 :: v_dual_mov_b32 v3, s39 +; GFX1250-NEXT: v_dual_mov_b32 v0, s30 :: v_dual_mov_b32 v1, s31 +; GFX1250-NEXT: v_dual_mov_b32 v2, s34 :: v_dual_mov_b32 v3, s35 ; GFX1250-NEXT: s_wait_xcnt 0x4 -; GFX1250-NEXT: v_mov_b32_e32 v4, s46 -; GFX1250-NEXT: s_lshr_b32 s68, s10, 24 -; GFX1250-NEXT: s_lshr_b32 s70, s10, 25 -; GFX1250-NEXT: s_lshr_b32 s72, s10, 22 -; GFX1250-NEXT: s_lshr_b32 s76, s10, 23 -; GFX1250-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x10000 -; GFX1250-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x10000 -; GFX1250-NEXT: v_dual_mov_b32 v5, s47 :: v_dual_mov_b32 v6, s42 +; GFX1250-NEXT: v_mov_b32_e32 v4, s4 +; GFX1250-NEXT: s_lshr_b32 s58, s2, 24 +; GFX1250-NEXT: s_lshr_b32 s62, s2, 25 +; GFX1250-NEXT: s_lshr_b32 s64, s2, 22 +; GFX1250-NEXT: s_lshr_b32 s68, s2, 23 +; GFX1250-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000 +; GFX1250-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 +; GFX1250-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v6, s36 ; GFX1250-NEXT: s_wait_xcnt 0x3 -; GFX1250-NEXT: v_dual_mov_b32 v7, s43 :: v_dual_mov_b32 v10, s48 -; GFX1250-NEXT: s_lshr_b32 s80, s10, 20 -; GFX1250-NEXT: s_lshr_b32 s82, s10, 21 -; GFX1250-NEXT: s_bfe_i64 s[64:65], s[64:65], 0x10000 -; GFX1250-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x10000 -; GFX1250-NEXT: v_dual_mov_b32 v11, s49 :: v_dual_mov_b32 v12, s50 +; GFX1250-NEXT: v_dual_mov_b32 v7, s37 :: v_dual_mov_b32 v10, s38 +; GFX1250-NEXT: s_lshr_b32 s72, s2, 20 +; GFX1250-NEXT: s_lshr_b32 s74, s2, 21 +; GFX1250-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x10000 +; GFX1250-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x10000 +; GFX1250-NEXT: v_dual_mov_b32 v11, s39 :: v_dual_mov_b32 v12, s42 ; GFX1250-NEXT: s_wait_xcnt 0x2 -; GFX1250-NEXT: v_dual_mov_b32 v13, s51 :: v_dual_mov_b32 v14, s54 -; GFX1250-NEXT: s_lshr_b32 s84, s10, 18 -; GFX1250-NEXT: s_lshr_b32 s86, s10, 19 -; GFX1250-NEXT: s_bfe_i64 s[76:77], s[76:77], 0x10000 -; GFX1250-NEXT: s_bfe_i64 s[72:73], s[72:73], 0x10000 -; GFX1250-NEXT: s_bfe_i64 s[70:71], s[70:71], 0x10000 +; GFX1250-NEXT: v_dual_mov_b32 v13, s43 :: v_dual_mov_b32 v14, s44 +; GFX1250-NEXT: s_lshr_b32 s78, s2, 18 +; GFX1250-NEXT: s_lshr_b32 s82, s2, 19 ; GFX1250-NEXT: s_bfe_i64 s[68:69], s[68:69], 0x10000 -; GFX1250-NEXT: v_dual_mov_b32 v15, s55 :: v_dual_mov_b32 v16, s58 +; GFX1250-NEXT: s_bfe_i64 s[64:65], s[64:65], 0x10000 +; GFX1250-NEXT: s_bfe_i64 s[62:63], s[62:63], 0x10000 +; GFX1250-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x10000 +; GFX1250-NEXT: v_dual_mov_b32 v15, s45 :: v_dual_mov_b32 v16, s46 ; GFX1250-NEXT: s_wait_xcnt 0x1 -; GFX1250-NEXT: v_dual_mov_b32 v17, s59 :: v_dual_mov_b32 v18, s60 -; GFX1250-NEXT: s_lshr_b32 s90, s10, 16 -; GFX1250-NEXT: s_lshr_b32 s98, s10, 17 -; GFX1250-NEXT: s_bfe_i64 s[82:83], s[82:83], 0x10000 -; GFX1250-NEXT: s_bfe_i64 s[80:81], s[80:81], 0x10000 -; GFX1250-NEXT: v_dual_mov_b32 v19, s61 :: v_dual_mov_b32 v20, s64 +; GFX1250-NEXT: v_dual_mov_b32 v17, s47 :: v_dual_mov_b32 v18, s52 +; GFX1250-NEXT: s_lshr_b32 s84, s2, 16 +; GFX1250-NEXT: s_lshr_b32 s98, s2, 17 +; GFX1250-NEXT: s_bfe_i64 s[74:75], s[74:75], 0x10000 +; GFX1250-NEXT: s_bfe_i64 s[72:73], s[72:73], 0x10000 +; GFX1250-NEXT: v_dual_mov_b32 v19, s53 :: v_dual_mov_b32 v20, s54 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v21, s65 :: v_dual_mov_b32 v22, s68 -; GFX1250-NEXT: s_lshr_b32 s96, s10, 14 -; GFX1250-NEXT: s_lshr_b32 s100, s10, 15 -; GFX1250-NEXT: s_lshr_b32 s94, s10, 13 -; GFX1250-NEXT: s_lshr_b32 s88, s10, 11 -; GFX1250-NEXT: s_lshr_b32 s74, s10, 9 -; GFX1250-NEXT: s_lshr_b32 s62, s10, 7 -; GFX1250-NEXT: s_lshr_b32 s52, s10, 5 -; GFX1250-NEXT: s_lshr_b32 s40, s10, 3 -; GFX1250-NEXT: s_lshr_b32 s26, s10, 1 -; GFX1250-NEXT: s_bfe_i64 s[86:87], s[86:87], 0x10000 -; GFX1250-NEXT: s_bfe_i64 s[84:85], s[84:85], 0x10000 -; GFX1250-NEXT: v_dual_mov_b32 v23, s69 :: v_dual_mov_b32 v24, s70 -; GFX1250-NEXT: v_mov_b32_e32 v25, s71 +; GFX1250-NEXT: v_dual_mov_b32 v21, s55 :: v_dual_mov_b32 v22, s58 +; GFX1250-NEXT: s_lshr_b32 s96, s2, 14 +; GFX1250-NEXT: s_lshr_b32 s100, s2, 15 +; GFX1250-NEXT: s_lshr_b32 s94, s2, 13 +; GFX1250-NEXT: s_lshr_b32 s90, s2, 11 +; GFX1250-NEXT: s_lshr_b32 s86, s2, 9 +; GFX1250-NEXT: s_lshr_b32 s76, s2, 7 +; GFX1250-NEXT: s_lshr_b32 s66, s2, 5 +; GFX1250-NEXT: s_lshr_b32 s56, s2, 3 +; GFX1250-NEXT: s_lshr_b32 s48, s2, 1 +; GFX1250-NEXT: s_bfe_i64 s[82:83], s[82:83], 0x10000 +; GFX1250-NEXT: s_bfe_i64 s[78:79], s[78:79], 0x10000 +; GFX1250-NEXT: v_dual_mov_b32 v23, s59 :: v_dual_mov_b32 v24, s62 +; GFX1250-NEXT: v_mov_b32_e32 v25, s63 ; GFX1250-NEXT: s_clause 0x5 ; GFX1250-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:272 ; GFX1250-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:256 @@ -11392,46 +11401,46 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX1250-NEXT: global_store_b128 v8, v[18:21], s[0:1] offset:208 ; GFX1250-NEXT: global_store_b128 v8, v[22:25], s[0:1] offset:192 ; GFX1250-NEXT: s_wait_xcnt 0x5 -; GFX1250-NEXT: v_dual_mov_b32 v0, s72 :: v_dual_mov_b32 v1, s73 -; GFX1250-NEXT: v_dual_mov_b32 v2, s76 :: v_dual_mov_b32 v3, s77 +; GFX1250-NEXT: v_dual_mov_b32 v0, s64 :: v_dual_mov_b32 v1, s65 +; GFX1250-NEXT: v_dual_mov_b32 v2, s68 :: v_dual_mov_b32 v3, s69 ; GFX1250-NEXT: s_wait_xcnt 0x4 -; GFX1250-NEXT: v_mov_b32_e32 v4, s80 -; GFX1250-NEXT: s_lshr_b32 s92, s10, 12 -; GFX1250-NEXT: s_lshr_b32 s78, s10, 10 +; GFX1250-NEXT: v_mov_b32_e32 v4, s72 +; GFX1250-NEXT: s_lshr_b32 s92, s2, 12 +; GFX1250-NEXT: s_lshr_b32 s88, s2, 10 ; GFX1250-NEXT: s_bfe_i64 s[98:99], s[98:99], 0x10000 -; GFX1250-NEXT: s_bfe_i64 s[90:91], s[90:91], 0x10000 -; GFX1250-NEXT: v_dual_mov_b32 v5, s81 :: v_dual_mov_b32 v6, s82 +; GFX1250-NEXT: s_bfe_i64 s[84:85], s[84:85], 0x10000 +; GFX1250-NEXT: v_dual_mov_b32 v5, s73 :: v_dual_mov_b32 v6, s74 ; GFX1250-NEXT: s_wait_xcnt 0x3 -; GFX1250-NEXT: v_dual_mov_b32 v7, s83 :: v_dual_mov_b32 v10, s84 -; GFX1250-NEXT: s_lshr_b32 s66, s10, 8 -; GFX1250-NEXT: s_lshr_b32 s56, s10, 6 -; GFX1250-NEXT: s_lshr_b32 s44, s10, 4 -; GFX1250-NEXT: s_lshr_b32 s30, s10, 2 -; GFX1250-NEXT: s_bfe_i64 s[18:19], s[10:11], 0x10000 -; GFX1250-NEXT: s_bfe_i64 s[10:11], s[26:27], 0x10000 -; GFX1250-NEXT: s_bfe_i64 s[26:27], s[40:41], 0x10000 -; GFX1250-NEXT: s_bfe_i64 s[40:41], s[52:53], 0x10000 -; GFX1250-NEXT: s_bfe_i64 s[52:53], s[62:63], 0x10000 -; GFX1250-NEXT: s_bfe_i64 s[62:63], s[74:75], 0x10000 -; GFX1250-NEXT: s_bfe_i64 s[74:75], s[88:89], 0x10000 -; GFX1250-NEXT: s_bfe_i64 s[88:89], s[94:95], 0x10000 +; GFX1250-NEXT: v_dual_mov_b32 v7, s75 :: v_dual_mov_b32 v10, s78 +; GFX1250-NEXT: s_lshr_b32 s80, s2, 8 +; GFX1250-NEXT: s_lshr_b32 s70, s2, 6 +; GFX1250-NEXT: s_lshr_b32 s60, s2, 4 +; GFX1250-NEXT: s_lshr_b32 s50, s2, 2 +; GFX1250-NEXT: s_bfe_i64 s[40:41], s[2:3], 0x10000 +; GFX1250-NEXT: s_bfe_i64 s[2:3], s[48:49], 0x10000 +; GFX1250-NEXT: s_bfe_i64 s[48:49], s[56:57], 0x10000 +; GFX1250-NEXT: s_bfe_i64 s[56:57], s[66:67], 0x10000 +; GFX1250-NEXT: s_bfe_i64 s[66:67], s[76:77], 0x10000 +; GFX1250-NEXT: s_bfe_i64 s[76:77], s[86:87], 0x10000 +; GFX1250-NEXT: s_bfe_i64 s[86:87], s[90:91], 0x10000 +; GFX1250-NEXT: s_bfe_i64 s[90:91], s[94:95], 0x10000 ; GFX1250-NEXT: s_bfe_i64 s[94:95], s[100:101], 0x10000 ; GFX1250-NEXT: s_bfe_i64 s[96:97], s[96:97], 0x10000 -; GFX1250-NEXT: v_dual_mov_b32 v11, s85 :: v_dual_mov_b32 v12, s86 +; GFX1250-NEXT: v_dual_mov_b32 v11, s79 :: v_dual_mov_b32 v12, s82 ; GFX1250-NEXT: s_wait_xcnt 0x2 -; GFX1250-NEXT: v_dual_mov_b32 v13, s87 :: v_dual_mov_b32 v14, s90 -; GFX1250-NEXT: s_bfe_i64 s[78:79], s[78:79], 0x10000 +; GFX1250-NEXT: v_dual_mov_b32 v13, s83 :: v_dual_mov_b32 v14, s84 +; GFX1250-NEXT: s_bfe_i64 s[88:89], s[88:89], 0x10000 ; GFX1250-NEXT: s_bfe_i64 s[92:93], s[92:93], 0x10000 -; GFX1250-NEXT: v_dual_mov_b32 v15, s91 :: v_dual_mov_b32 v16, s98 +; GFX1250-NEXT: v_dual_mov_b32 v15, s85 :: v_dual_mov_b32 v16, s98 ; GFX1250-NEXT: s_wait_xcnt 0x1 ; GFX1250-NEXT: v_dual_mov_b32 v17, s99 :: v_dual_mov_b32 v18, s96 -; GFX1250-NEXT: s_bfe_i64 s[66:67], s[66:67], 0x10000 +; GFX1250-NEXT: s_bfe_i64 s[80:81], s[80:81], 0x10000 ; GFX1250-NEXT: v_dual_mov_b32 v19, s97 :: v_dual_mov_b32 v20, s94 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: v_dual_mov_b32 v21, s95 :: v_dual_mov_b32 v22, s92 -; GFX1250-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x10000 -; GFX1250-NEXT: v_dual_mov_b32 v23, s93 :: v_dual_mov_b32 v24, s88 -; GFX1250-NEXT: v_mov_b32_e32 v25, s89 +; GFX1250-NEXT: s_bfe_i64 s[70:71], s[70:71], 0x10000 +; GFX1250-NEXT: v_dual_mov_b32 v23, s93 :: v_dual_mov_b32 v24, s90 +; GFX1250-NEXT: v_mov_b32_e32 v25, s91 ; GFX1250-NEXT: s_clause 0x5 ; GFX1250-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:176 ; GFX1250-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:160 @@ -11440,26 +11449,26 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX1250-NEXT: global_store_b128 v8, v[18:21], s[0:1] offset:112 ; GFX1250-NEXT: global_store_b128 v8, v[22:25], s[0:1] offset:96 ; GFX1250-NEXT: s_wait_xcnt 0x5 -; GFX1250-NEXT: v_dual_mov_b32 v0, s78 :: v_dual_mov_b32 v1, s79 -; GFX1250-NEXT: v_dual_mov_b32 v2, s74 :: v_dual_mov_b32 v3, s75 +; GFX1250-NEXT: v_dual_mov_b32 v0, s88 :: v_dual_mov_b32 v1, s89 +; GFX1250-NEXT: v_dual_mov_b32 v2, s86 :: v_dual_mov_b32 v3, s87 ; GFX1250-NEXT: s_wait_xcnt 0x4 -; GFX1250-NEXT: v_mov_b32_e32 v4, s66 -; GFX1250-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 -; GFX1250-NEXT: v_dual_mov_b32 v5, s67 :: v_dual_mov_b32 v6, s62 +; GFX1250-NEXT: v_mov_b32_e32 v4, s80 +; GFX1250-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x10000 +; GFX1250-NEXT: v_dual_mov_b32 v5, s81 :: v_dual_mov_b32 v6, s76 ; GFX1250-NEXT: s_wait_xcnt 0x3 -; GFX1250-NEXT: v_dual_mov_b32 v7, s63 :: v_dual_mov_b32 v10, s56 -; GFX1250-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 -; GFX1250-NEXT: v_dual_mov_b32 v11, s57 :: v_dual_mov_b32 v12, s52 +; GFX1250-NEXT: v_dual_mov_b32 v7, s77 :: v_dual_mov_b32 v10, s70 +; GFX1250-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000 +; GFX1250-NEXT: v_dual_mov_b32 v11, s71 :: v_dual_mov_b32 v12, s66 ; GFX1250-NEXT: s_wait_xcnt 0x2 -; GFX1250-NEXT: v_dual_mov_b32 v13, s53 :: v_dual_mov_b32 v14, s44 -; GFX1250-NEXT: v_dual_mov_b32 v15, s45 :: v_dual_mov_b32 v16, s40 +; GFX1250-NEXT: v_dual_mov_b32 v13, s67 :: v_dual_mov_b32 v14, s60 +; GFX1250-NEXT: v_dual_mov_b32 v15, s61 :: v_dual_mov_b32 v16, s56 ; GFX1250-NEXT: s_wait_xcnt 0x1 -; GFX1250-NEXT: v_dual_mov_b32 v17, s41 :: v_dual_mov_b32 v18, s30 -; GFX1250-NEXT: v_dual_mov_b32 v19, s31 :: v_dual_mov_b32 v20, s26 +; GFX1250-NEXT: v_dual_mov_b32 v17, s57 :: v_dual_mov_b32 v18, s50 +; GFX1250-NEXT: v_dual_mov_b32 v19, s51 :: v_dual_mov_b32 v20, s48 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v21, s27 :: v_dual_mov_b32 v22, s18 -; GFX1250-NEXT: v_dual_mov_b32 v23, s19 :: v_dual_mov_b32 v24, s10 -; GFX1250-NEXT: v_mov_b32_e32 v25, s11 +; GFX1250-NEXT: v_dual_mov_b32 v21, s49 :: v_dual_mov_b32 v22, s40 +; GFX1250-NEXT: v_dual_mov_b32 v23, s41 :: v_dual_mov_b32 v24, s2 +; GFX1250-NEXT: v_mov_b32_e32 v25, s3 ; GFX1250-NEXT: s_clause 0x5 ; GFX1250-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:80 ; GFX1250-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:64 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll index a135b43bad0fe..a56360fc8fcbc 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -6177,6 +6177,7 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NOHSA-SI-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, s5 @@ -6203,6 +6204,7 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %ou ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -6236,28 +6238,29 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %ou ; GCN-NOHSA-VI-LABEL: constant_sextload_v4i16_to_v4i64: ; GCN-NOHSA-VI: ; %bb.0: ; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN-NOHSA-VI-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[2:3], 0x100000 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s2, 16 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, s3 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s3 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s3, 16 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x100000 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NOHSA-VI-NEXT: s_add_u32 s6, s0, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s7 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s7, s1, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s6 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s0, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s1, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s9 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s7 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 @@ -6298,19 +6301,20 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %ou ; GFX12-LABEL: constant_sextload_v4i16_to_v4i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_mov_b32 s6, s3 +; GFX12-NEXT: s_mov_b32 s4, s3 ; GFX12-NEXT: s_lshr_b32 s8, s3, 16 -; GFX12-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x100000 +; GFX12-NEXT: s_bfe_i64 s[6:7], s[2:3], 0x100000 ; GFX12-NEXT: s_lshr_b32 s2, s2, 16 -; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000 +; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000 ; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x100000 ; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x100000 -; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v5, s7 -; GFX12-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v7, s9 +; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v5, s5 +; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s9 ; GFX12-NEXT: v_dual_mov_b32 v6, s8 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: v_mov_b32_e32 v2, s2 ; GFX12-NEXT: s_clause 0x1 @@ -6542,26 +6546,28 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NOHSA-SI-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 +; GCN-NOHSA-SI-NEXT: ; implicit-def: $sgpr10_sgpr11 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s7 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s10, s6, 16 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s12, s5 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s6, 16 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s5 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s4, 16 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[4:5], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[6:7], 0x100000 ; GCN-NOHSA-SI-NEXT: s_ashr_i32 s13, s5, 31 ; GCN-NOHSA-SI-NEXT: s_ashr_i32 s15, s5, 16 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[12:13], 0x100000 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s12, s7, 31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s20, s7, 16 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[10:11], 0x100000 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s20, s7, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s21, s7, 16 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[8:9], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[14:15], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[12:13], 0x100000 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s7 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s20 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s21 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s20 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s18 @@ -6586,22 +6592,24 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GCN-HSA-NEXT: ; implicit-def: $sgpr2_sgpr3 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_mov_b32 s2, s7 -; GCN-HSA-NEXT: s_lshr_b32 s8, s6, 16 -; GCN-HSA-NEXT: s_mov_b32 s10, s5 +; GCN-HSA-NEXT: s_lshr_b32 s10, s6, 16 +; GCN-HSA-NEXT: s_mov_b32 s8, s5 ; GCN-HSA-NEXT: s_lshr_b32 s12, s4, 16 ; GCN-HSA-NEXT: s_ashr_i32 s13, s5, 16 ; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[4:5], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[16:17], s[6:7], 0x100000 ; GCN-HSA-NEXT: s_ashr_i32 s6, s5, 31 ; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[12:13], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x100000 ; GCN-HSA-NEXT: s_ashr_i32 s12, s7, 31 ; GCN-HSA-NEXT: s_ashr_i32 s7, s7, 16 ; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x100000 @@ -6621,13 +6629,13 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s16 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s17 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s11 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s9 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s13 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 @@ -6644,30 +6652,32 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou ; GCN-NOHSA-VI-LABEL: constant_sextload_v8i16_to_v8i64: ; GCN-NOHSA-VI: ; %bb.0: ; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN-NOHSA-VI-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GCN-NOHSA-VI-NEXT: ; implicit-def: $sgpr2_sgpr3 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s5, 16 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[6:7], 0x100000 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s6, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[6:7], 0x100000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, s7 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[4:5], 0x100000 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s4, 16 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s5 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s5, 16 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, s5 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[8:9], 0x100000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[16:17], s[6:7], 0x100000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s7 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s7, 16 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x100000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x100000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NOHSA-VI-NEXT: s_add_u32 s6, s0, 48 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s7 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s7, s1, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s6 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s17 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s9 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s7 ; GCN-NOHSA-VI-NEXT: s_add_u32 s6, s0, 32 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -6675,22 +6685,23 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s6 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s13 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s14 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s15 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s7 -; GCN-NOHSA-VI-NEXT: s_add_u32 s6, s0, 16 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s7, s1, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s6 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s9 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s11 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s17 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s7 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: s_nop 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s14 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s15 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s11 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s5 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0 @@ -6750,31 +6761,33 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou ; GFX12-LABEL: constant_sextload_v8i16_to_v8i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX12-NEXT: ; implicit-def: $sgpr2_sgpr3 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_mov_b32 s14, s7 +; GFX12-NEXT: s_mov_b32 s8, s7 ; GFX12-NEXT: s_lshr_b32 s16, s7, 16 -; GFX12-NEXT: s_bfe_i64 s[12:13], s[6:7], 0x100000 +; GFX12-NEXT: s_bfe_i64 s[14:15], s[6:7], 0x100000 ; GFX12-NEXT: s_lshr_b32 s6, s6, 16 -; GFX12-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x100000 -; GFX12-NEXT: s_mov_b32 s8, s5 -; GFX12-NEXT: s_lshr_b32 s10, s5, 16 -; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x100000 +; GFX12-NEXT: s_bfe_i64 s[10:11], s[4:5], 0x100000 +; GFX12-NEXT: s_mov_b32 s2, s5 +; GFX12-NEXT: s_lshr_b32 s12, s5, 16 +; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x100000 ; GFX12-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x100000 ; GFX12-NEXT: s_lshr_b32 s4, s4, 16 ; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000 -; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v1, s13 -; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x100000 -; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x100000 -; GFX12-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v9, s15 -; GFX12-NEXT: v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v11, s17 +; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v1, s15 +; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x100000 +; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x100000 +; GFX12-NEXT: v_dual_mov_b32 v4, s10 :: v_dual_mov_b32 v9, s9 +; GFX12-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v11, s17 ; GFX12-NEXT: v_dual_mov_b32 v10, s16 :: v_dual_mov_b32 v3, s7 ; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000 -; GFX12-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v5, s3 -; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v13, s9 -; GFX12-NEXT: v_dual_mov_b32 v12, s8 :: v_dual_mov_b32 v15, s11 -; GFX12-NEXT: v_dual_mov_b32 v14, s10 :: v_dual_mov_b32 v7, s5 +; GFX12-NEXT: v_dual_mov_b32 v0, s14 :: v_dual_mov_b32 v5, s11 +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v13, s3 +; GFX12-NEXT: v_dual_mov_b32 v12, s2 :: v_dual_mov_b32 v15, s13 +; GFX12-NEXT: v_dual_mov_b32 v14, s12 :: v_dual_mov_b32 v7, s5 ; GFX12-NEXT: v_mov_b32_e32 v6, s4 ; GFX12-NEXT: s_clause 0x3 ; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:48 @@ -7164,15 +7177,19 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) % ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, 0xf000 +; GCN-NOHSA-SI-NEXT: ; implicit-def: $sgpr12_sgpr13 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, -1 +; GCN-NOHSA-SI-NEXT: ; implicit-def: $sgpr14_sgpr15 +; GCN-NOHSA-SI-NEXT: ; implicit-def: $sgpr16_sgpr17 +; GCN-NOHSA-SI-NEXT: ; implicit-def: $sgpr18_sgpr19 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_mov_b32 s12, s7 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s6, 16 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s16, s5 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s4, 16 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s20, s3 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s2, 16 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s24, s1 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s6, 16 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s14, s5 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s4, 16 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s16, s3 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s2, 16 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s18, s1 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s0, 16 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[0:1], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[30:31], s[2:3], 0x100000 @@ -7180,40 +7197,40 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) % ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[6:7], 0x100000 ; GCN-NOHSA-SI-NEXT: s_ashr_i32 s21, s1, 31 ; GCN-NOHSA-SI-NEXT: s_ashr_i32 s23, s1, 16 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[0:1], s[24:25], 0x100000 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s24, s3, 31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s25, s3, 16 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[2:3], s[20:21], 0x100000 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s20, s5, 31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s27, s5, 16 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[16:17], 0x100000 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s7, 31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s38, s7, 16 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[0:1], s[18:19], 0x100000 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s25, s3, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s27, s3, 16 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[2:3], s[16:17], 0x100000 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s5, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s38, s5, 16 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[14:15], 0x100000 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s39, s7, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s40, s7, 16 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[12:13], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[26:27], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[24:25], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[22:23], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[20:21], 0x100000 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s7 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s38 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s33 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s40 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s39 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:112 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s36 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s37 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s4 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s5 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s27 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s20 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s38 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s33 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[2:5], off, s[8:11], 0 offset:80 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s34 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s35 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s2 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s3 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s25 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s25 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[8:11], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s30 @@ -7226,14 +7243,14 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) % ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s28 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s29 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s15 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s18 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s19 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:96 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s18 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s19 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s17 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:64 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s16 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s17 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s14 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s15 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[8:11], 0 offset:32 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s12 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s13 @@ -7244,21 +7261,25 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) % ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0 ; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 -; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-HSA-NEXT: ; implicit-def: $sgpr12_sgpr13 +; GCN-HSA-NEXT: ; implicit-def: $sgpr14_sgpr15 +; GCN-HSA-NEXT: ; implicit-def: $sgpr16_sgpr17 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 +; GCN-HSA-NEXT: ; implicit-def: $sgpr10_sgpr11 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_mov_b32 s10, s7 -; GCN-HSA-NEXT: s_lshr_b32 s12, s6, 16 -; GCN-HSA-NEXT: s_mov_b32 s14, s5 -; GCN-HSA-NEXT: s_lshr_b32 s16, s4, 16 +; GCN-HSA-NEXT: s_mov_b32 s12, s5 +; GCN-HSA-NEXT: s_mov_b32 s14, s3 +; GCN-HSA-NEXT: s_mov_b32 s16, s1 ; GCN-HSA-NEXT: s_ashr_i32 s25, s1, 31 ; GCN-HSA-NEXT: s_ashr_i32 s29, s3, 31 ; GCN-HSA-NEXT: s_ashr_i32 s30, s3, 16 -; GCN-HSA-NEXT: s_mov_b32 s18, s3 -; GCN-HSA-NEXT: s_lshr_b32 s20, s2, 16 -; GCN-HSA-NEXT: s_mov_b32 s22, s1 +; GCN-HSA-NEXT: s_lshr_b32 s18, s6, 16 +; GCN-HSA-NEXT: s_lshr_b32 s20, s4, 16 +; GCN-HSA-NEXT: s_lshr_b32 s22, s2, 16 ; GCN-HSA-NEXT: s_lshr_b32 s24, s0, 16 ; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[2:3], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[10:11], 0x100000 @@ -7273,12 +7294,12 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) % ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[24:25], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[22:23], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[16:17], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[16:17], s[22:23], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x100000 ; GCN-HSA-NEXT: s_add_u32 s22, s8, 0x70 ; GCN-HSA-NEXT: s_addc_u32 s23, s9, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 @@ -7292,14 +7313,14 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) % ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s35 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s34 ; GCN-HSA-NEXT: s_add_u32 s6, s8, 0x50 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s18 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s19 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; GCN-HSA-NEXT: s_addc_u32 s7, s9, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s13 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s33 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s31 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 @@ -7310,15 +7331,15 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) % ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_addc_u32 s5, s9, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s16 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s17 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s20 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s21 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GCN-HSA-NEXT: s_add_u32 s4, s8, 48 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s5, s9, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s18 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s19 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s15 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s30 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s29 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 @@ -7328,8 +7349,8 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) % ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s26 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s27 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s20 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s21 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s17 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GCN-HSA-NEXT: s_add_u32 s4, s8, 16 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -7353,6 +7374,10 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) % ; GCN-NOHSA-VI-LABEL: constant_sextload_v16i16_to_v16i64: ; GCN-NOHSA-VI: ; %bb.0: ; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GCN-NOHSA-VI-NEXT: ; implicit-def: $sgpr12_sgpr13 +; GCN-NOHSA-VI-NEXT: ; implicit-def: $sgpr20_sgpr21 +; GCN-NOHSA-VI-NEXT: ; implicit-def: $sgpr26_sgpr27 +; GCN-NOHSA-VI-NEXT: ; implicit-def: $sgpr34_sgpr35 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -7364,25 +7389,25 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) % ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s6, 16 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[28:29], s[4:5], 0x100000 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s4, 16 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[0:1], 0x100000 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s0, s0, 16 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[12:13], 0x100000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[18:19], 0x100000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[18:19], s[2:3], 0x100000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, s3 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[24:25], s[6:7], 0x100000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, s7 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[30:31], s[4:5], 0x100000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s5 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[0:1], 0x100000 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s0, s0, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[20:21], s[2:3], 0x100000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s20, s3 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s3, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[26:27], s[6:7], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[24:25], s[6:7], 0x100000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s26, s7 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s7, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[34:35], s[4:5], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[30:31], s[4:5], 0x100000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s34, s5 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s5, 16 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x100000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x100000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x100000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4 ; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s8, 0x50 @@ -7546,65 +7571,69 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) % ; ; GFX12-LABEL: constant_sextload_v16i16_to_v16i64: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 +; GFX12-NEXT: ; implicit-def: $sgpr18_sgpr19 +; GFX12-NEXT: ; implicit-def: $sgpr16_sgpr17 +; GFX12-NEXT: ; implicit-def: $sgpr14_sgpr15 +; GFX12-NEXT: ; implicit-def: $sgpr12_sgpr13 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x0 +; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_mov_b32 s30, s9 -; GFX12-NEXT: s_lshr_b32 s34, s9, 16 -; GFX12-NEXT: s_bfe_i64 s[28:29], s[8:9], 0x100000 -; GFX12-NEXT: s_lshr_b32 s8, s8, 16 -; GFX12-NEXT: s_bfe_i64 s[22:23], s[10:11], 0x100000 -; GFX12-NEXT: s_mov_b32 s24, s11 -; GFX12-NEXT: s_lshr_b32 s26, s11, 16 -; GFX12-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x100000 +; GFX12-NEXT: s_mov_b32 s18, s5 +; GFX12-NEXT: s_lshr_b32 s34, s5, 16 +; GFX12-NEXT: s_bfe_i64 s[30:31], s[4:5], 0x100000 +; GFX12-NEXT: s_lshr_b32 s4, s4, 16 +; GFX12-NEXT: s_bfe_i64 s[26:27], s[6:7], 0x100000 +; GFX12-NEXT: s_mov_b32 s16, s7 +; GFX12-NEXT: s_lshr_b32 s28, s7, 16 +; GFX12-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x100000 ; GFX12-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x100000 -; GFX12-NEXT: s_lshr_b32 s10, s10, 16 -; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x100000 -; GFX12-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s29 -; GFX12-NEXT: s_mov_b32 s18, s7 -; GFX12-NEXT: s_lshr_b32 s20, s7, 16 -; GFX12-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x100000 -; GFX12-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x100000 -; GFX12-NEXT: v_dual_mov_b32 v4, s22 :: v_dual_mov_b32 v9, s31 -; GFX12-NEXT: v_dual_mov_b32 v8, s30 :: v_dual_mov_b32 v11, s35 -; GFX12-NEXT: v_dual_mov_b32 v10, s34 :: v_dual_mov_b32 v3, s9 -; GFX12-NEXT: s_bfe_i64 s[16:17], s[6:7], 0x100000 ; GFX12-NEXT: s_lshr_b32 s6, s6, 16 -; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x100000 -; GFX12-NEXT: v_dual_mov_b32 v0, s28 :: v_dual_mov_b32 v5, s23 -; GFX12-NEXT: v_dual_mov_b32 v2, s8 :: v_dual_mov_b32 v13, s25 -; GFX12-NEXT: s_mov_b32 s12, s5 -; GFX12-NEXT: s_lshr_b32 s14, s5, 16 -; GFX12-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x100000 -; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x100000 -; GFX12-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 -; GFX12-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v7, s11 -; GFX12-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x100000 -; GFX12-NEXT: s_lshr_b32 s4, s4, 16 +; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000 +; GFX12-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s31 +; GFX12-NEXT: s_mov_b32 s14, s3 +; GFX12-NEXT: s_lshr_b32 s24, s3, 16 +; GFX12-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x100000 +; GFX12-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x100000 +; GFX12-NEXT: v_dual_mov_b32 v4, s26 :: v_dual_mov_b32 v9, s19 +; GFX12-NEXT: v_dual_mov_b32 v8, s18 :: v_dual_mov_b32 v11, s35 +; GFX12-NEXT: v_dual_mov_b32 v10, s34 :: v_dual_mov_b32 v3, s5 +; GFX12-NEXT: s_bfe_i64 s[22:23], s[2:3], 0x100000 +; GFX12-NEXT: s_lshr_b32 s2, s2, 16 ; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000 -; GFX12-NEXT: v_dual_mov_b32 v6, s10 :: v_dual_mov_b32 v17, s19 -; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x100000 +; GFX12-NEXT: v_dual_mov_b32 v0, s30 :: v_dual_mov_b32 v5, s27 +; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v13, s17 +; GFX12-NEXT: s_mov_b32 s12, s1 +; GFX12-NEXT: s_lshr_b32 s20, s1, 16 ; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x100000 -; GFX12-NEXT: v_dual_mov_b32 v16, s18 :: v_dual_mov_b32 v19, s21 -; GFX12-NEXT: v_mov_b32_e32 v18, s20 -; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000 +; GFX12-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x100000 +; GFX12-NEXT: v_dual_mov_b32 v12, s16 :: v_dual_mov_b32 v15, s29 +; GFX12-NEXT: v_dual_mov_b32 v14, s28 :: v_dual_mov_b32 v7, s7 +; GFX12-NEXT: s_bfe_i64 s[10:11], s[0:1], 0x100000 +; GFX12-NEXT: s_lshr_b32 s0, s0, 16 +; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x100000 +; GFX12-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v17, s15 +; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x100000 +; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x100000 +; GFX12-NEXT: v_dual_mov_b32 v16, s14 :: v_dual_mov_b32 v19, s25 +; GFX12-NEXT: v_mov_b32_e32 v18, s24 +; GFX12-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x100000 ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:80 -; GFX12-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:64 -; GFX12-NEXT: v_dual_mov_b32 v0, s16 :: v_dual_mov_b32 v3, s7 -; GFX12-NEXT: v_dual_mov_b32 v1, s17 :: v_dual_mov_b32 v2, s6 +; GFX12-NEXT: global_store_b128 v24, v[8:11], s[8:9] offset:80 +; GFX12-NEXT: global_store_b128 v24, v[0:3], s[8:9] offset:64 +; GFX12-NEXT: v_dual_mov_b32 v0, s22 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: v_dual_mov_b32 v1, s23 :: v_dual_mov_b32 v2, s2 ; GFX12-NEXT: v_dual_mov_b32 v9, s13 :: v_dual_mov_b32 v8, s12 -; GFX12-NEXT: v_dual_mov_b32 v11, s15 :: v_dual_mov_b32 v10, s14 -; GFX12-NEXT: v_dual_mov_b32 v21, s3 :: v_dual_mov_b32 v20, s2 -; GFX12-NEXT: v_dual_mov_b32 v23, s5 :: v_dual_mov_b32 v22, s4 +; GFX12-NEXT: v_dual_mov_b32 v11, s21 :: v_dual_mov_b32 v10, s20 +; GFX12-NEXT: v_dual_mov_b32 v21, s11 :: v_dual_mov_b32 v20, s10 +; GFX12-NEXT: v_dual_mov_b32 v23, s1 :: v_dual_mov_b32 v22, s0 ; GFX12-NEXT: s_clause 0x5 -; GFX12-NEXT: global_store_b128 v24, v[12:15], s[0:1] offset:112 -; GFX12-NEXT: global_store_b128 v24, v[4:7], s[0:1] offset:96 -; GFX12-NEXT: global_store_b128 v24, v[16:19], s[0:1] offset:48 -; GFX12-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:32 -; GFX12-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:16 -; GFX12-NEXT: global_store_b128 v24, v[20:23], s[0:1] +; GFX12-NEXT: global_store_b128 v24, v[12:15], s[8:9] offset:112 +; GFX12-NEXT: global_store_b128 v24, v[4:7], s[8:9] offset:96 +; GFX12-NEXT: global_store_b128 v24, v[16:19], s[8:9] offset:48 +; GFX12-NEXT: global_store_b128 v24, v[0:3], s[8:9] offset:32 +; GFX12-NEXT: global_store_b128 v24, v[8:11], s[8:9] offset:16 +; GFX12-NEXT: global_store_b128 v24, v[20:23], s[8:9] ; GFX12-NEXT: s_endpgm %load = load <16 x i16>, ptr addrspace(4) %in %ext = sext <16 x i16> %load to <16 x i64> @@ -8309,11 +8338,23 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 +; GCN-NOHSA-SI-NEXT: ; implicit-def: $sgpr18_sgpr19 +; GCN-NOHSA-SI-NEXT: ; implicit-def: $sgpr20_sgpr21 +; GCN-NOHSA-SI-NEXT: ; implicit-def: $sgpr48_sgpr49 +; GCN-NOHSA-SI-NEXT: ; implicit-def: $sgpr50_sgpr51 +; GCN-NOHSA-SI-NEXT: ; implicit-def: $sgpr46_sgpr47 +; GCN-NOHSA-SI-NEXT: ; implicit-def: $sgpr36_sgpr37 +; GCN-NOHSA-SI-NEXT: ; implicit-def: $sgpr42_sgpr43 +; GCN-NOHSA-SI-NEXT: ; implicit-def: $sgpr44_sgpr45 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_mov_b32 s18, s15 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s20, s13 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s22, s11 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s24, s9 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s48, s11 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s50, s9 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s46, s7 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s36, s5 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s42, s3 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s44, s1 ; GCN-NOHSA-SI-NEXT: s_ashr_i32 s23, s1, 31 ; GCN-NOHSA-SI-NEXT: s_ashr_i32 s25, s1, 16 ; GCN-NOHSA-SI-NEXT: s_ashr_i32 s27, s3, 31 @@ -8321,31 +8362,25 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-NOHSA-SI-NEXT: s_ashr_i32 s31, s5, 31 ; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s5, 16 ; GCN-NOHSA-SI-NEXT: s_ashr_i32 s35, s7, 31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s37, s7, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s39, s9, 31 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[48:49], s[24:25], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[50:51], s[22:23], 0x100000 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s39, s7, 16 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[52:53], s[20:21], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[54:55], s[18:19], 0x100000 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s43, s9, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s45, s11, 31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s47, s11, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s41, s13, 31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s56, s13, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s57, s15, 31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s58, s15, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s38, s14, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s12, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s10, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s8, 16 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s46, s7 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s6, 16 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s44, s5 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s4, 16 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s42, s3 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s2, 16 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s40, s1 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s0, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s41, s9, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s56, s9, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s57, s11, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s58, s11, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s59, s13, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s60, s13, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s61, s15, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s62, s15, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s14, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s12, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s10, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s8, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s6, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s4, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s38, s2, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s40, s0, 16 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[0:1], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[2:3], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000 @@ -8364,99 +8399,101 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s53 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s3 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s50 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s51 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s48 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s49 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s58 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s57 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s56 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s41 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[46:47], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[44:45], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[42:43], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x100000 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s47 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s45 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s43 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s39 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[48:49], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[50:51], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[46:47], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x100000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s13 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s62 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s61 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s60 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s59 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s58 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s57 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s14 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:240 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s12 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s13 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s37 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s35 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s11 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s15 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:208 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s14 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:176 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s15 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s33 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s31 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:144 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(1) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:176 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, s56 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, s41 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s17 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:144 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s39 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s35 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:112 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s17 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s29 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:80 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[38:39], 0x100000 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s27 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(2) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s40 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s41 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s36 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s37 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s33 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s31 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s42 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s43 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s29 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s27 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s44 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s45 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s25 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s23 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s10 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s11 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s9 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[36:37], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[34:35], 0x100000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s9 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[40:41], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[38:39], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[34:35], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[30:31], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[28:29], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x100000 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s7 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s5 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s12 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s13 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s6 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s7 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, s20 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, s21 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s25 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s20 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s24 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s25 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s18 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s19 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s22 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s23 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:192 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s18 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s19 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s22 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s23 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:160 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s26 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s27 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:128 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s16 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s17 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:96 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, s14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, s15 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s10 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s11 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s9 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s26 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s27 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:160 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s17 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:128 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s14 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s15 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:96 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(2) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s13 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:64 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v24, s10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v25, s11 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s9 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: constant_sextload_v32i16_to_v32i64: @@ -8465,98 +8502,108 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0 ; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-HSA-NEXT: ; implicit-def: $sgpr24_sgpr25 +; GCN-HSA-NEXT: ; implicit-def: $sgpr34_sgpr35 +; GCN-HSA-NEXT: ; implicit-def: $sgpr36_sgpr37 +; GCN-HSA-NEXT: ; implicit-def: $sgpr30_sgpr31 +; GCN-HSA-NEXT: ; implicit-def: $sgpr28_sgpr29 +; GCN-HSA-NEXT: ; implicit-def: $sgpr38_sgpr39 +; GCN-HSA-NEXT: ; implicit-def: $sgpr48_sgpr49 +; GCN-HSA-NEXT: ; implicit-def: $sgpr50_sgpr51 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_mov_b32 s24, s15 -; GCN-HSA-NEXT: s_ashr_i32 s37, s3, 31 -; GCN-HSA-NEXT: s_ashr_i32 s38, s3, 16 -; GCN-HSA-NEXT: s_ashr_i32 s57, s11, 16 -; GCN-HSA-NEXT: s_ashr_i32 s59, s13, 31 -; GCN-HSA-NEXT: s_ashr_i32 s61, s13, 16 -; GCN-HSA-NEXT: s_ashr_i32 s63, s15, 31 -; GCN-HSA-NEXT: s_ashr_i32 s65, s15, 16 -; GCN-HSA-NEXT: s_lshr_b32 s46, s14, 16 -; GCN-HSA-NEXT: s_mov_b32 s48, s13 -; GCN-HSA-NEXT: s_lshr_b32 s50, s12, 16 -; GCN-HSA-NEXT: s_mov_b32 s52, s11 -; GCN-HSA-NEXT: s_lshr_b32 s34, s10, 16 +; GCN-HSA-NEXT: s_mov_b32 s34, s13 +; GCN-HSA-NEXT: s_mov_b32 s36, s11 ; GCN-HSA-NEXT: s_mov_b32 s30, s9 -; GCN-HSA-NEXT: s_lshr_b32 s28, s8, 16 -; GCN-HSA-NEXT: s_mov_b32 s54, s7 -; GCN-HSA-NEXT: s_lshr_b32 s56, s6, 16 -; GCN-HSA-NEXT: s_mov_b32 s58, s5 -; GCN-HSA-NEXT: s_lshr_b32 s60, s4, 16 -; GCN-HSA-NEXT: s_mov_b32 s62, s3 +; GCN-HSA-NEXT: s_mov_b32 s28, s7 +; GCN-HSA-NEXT: s_mov_b32 s38, s5 +; GCN-HSA-NEXT: s_mov_b32 s48, s3 +; GCN-HSA-NEXT: s_mov_b32 s50, s1 +; GCN-HSA-NEXT: s_ashr_i32 s41, s3, 31 +; GCN-HSA-NEXT: s_ashr_i32 s42, s3, 16 +; GCN-HSA-NEXT: s_ashr_i32 s55, s9, 16 +; GCN-HSA-NEXT: s_ashr_i32 s57, s11, 31 +; GCN-HSA-NEXT: s_ashr_i32 s59, s11, 16 +; GCN-HSA-NEXT: s_ashr_i32 s61, s13, 31 +; GCN-HSA-NEXT: s_ashr_i32 s63, s13, 16 +; GCN-HSA-NEXT: s_ashr_i32 s65, s15, 31 +; GCN-HSA-NEXT: s_ashr_i32 s67, s15, 16 +; GCN-HSA-NEXT: s_lshr_b32 s52, s14, 16 +; GCN-HSA-NEXT: s_lshr_b32 s54, s12, 16 +; GCN-HSA-NEXT: s_lshr_b32 s56, s10, 16 +; GCN-HSA-NEXT: s_lshr_b32 s58, s8, 16 +; GCN-HSA-NEXT: s_lshr_b32 s60, s6, 16 +; GCN-HSA-NEXT: s_lshr_b32 s62, s4, 16 ; GCN-HSA-NEXT: s_lshr_b32 s64, s2, 16 -; GCN-HSA-NEXT: s_mov_b32 s66, s1 -; GCN-HSA-NEXT: s_lshr_b32 s68, s0, 16 +; GCN-HSA-NEXT: s_lshr_b32 s66, s0, 16 ; GCN-HSA-NEXT: s_bfe_i64 s[18:19], s[2:3], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[24:25], 0x100000 ; GCN-HSA-NEXT: s_ashr_i32 s33, s1, 31 -; GCN-HSA-NEXT: s_ashr_i32 s36, s1, 16 -; GCN-HSA-NEXT: s_ashr_i32 s39, s5, 31 -; GCN-HSA-NEXT: s_ashr_i32 s40, s5, 16 -; GCN-HSA-NEXT: s_ashr_i32 s41, s7, 31 -; GCN-HSA-NEXT: s_ashr_i32 s42, s7, 16 -; GCN-HSA-NEXT: s_ashr_i32 s43, s9, 31 -; GCN-HSA-NEXT: s_ashr_i32 s44, s9, 16 -; GCN-HSA-NEXT: s_ashr_i32 s45, s11, 31 +; GCN-HSA-NEXT: s_ashr_i32 s40, s1, 16 +; GCN-HSA-NEXT: s_ashr_i32 s43, s5, 31 +; GCN-HSA-NEXT: s_ashr_i32 s44, s5, 16 +; GCN-HSA-NEXT: s_ashr_i32 s45, s7, 31 +; GCN-HSA-NEXT: s_ashr_i32 s46, s7, 16 +; GCN-HSA-NEXT: s_ashr_i32 s47, s9, 31 ; GCN-HSA-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[4:5], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[22:23], s[6:7], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[8:9], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[70:71], s[10:11], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[72:73], s[12:13], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[74:75], s[14:15], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[68:69], s[10:11], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[70:71], s[12:13], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[72:73], s[14:15], 0x100000 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[68:69], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[66:67], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[66:67], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[50:51], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[64:65], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[62:63], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[60:61], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[58:59], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[56:57], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[54:55], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[48:49], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[62:63], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[38:39], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[60:61], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[28:29], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[28:29], s[58:59], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[38:39], s[56:57], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[48:49], s[54:55], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x100000 -; GCN-HSA-NEXT: s_add_u32 s54, s16, 0xf0 -; GCN-HSA-NEXT: s_addc_u32 s55, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s46 -; GCN-HSA-NEXT: s_add_u32 s46, s16, 0xe0 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s47 -; GCN-HSA-NEXT: s_addc_u32 s47, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s46 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s47 -; GCN-HSA-NEXT: s_add_u32 s46, s16, 0xd0 -; GCN-HSA-NEXT: s_addc_u32 s47, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s46 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s47 -; GCN-HSA-NEXT: s_add_u32 s46, s16, 0xc0 -; GCN-HSA-NEXT: s_addc_u32 s47, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v28, s46 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s54 -; GCN-HSA-NEXT: v_mov_b32_e32 v29, s47 -; GCN-HSA-NEXT: s_add_u32 s46, s16, 0xb0 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s55 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s65 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s63 -; GCN-HSA-NEXT: s_addc_u32 s47, s17, 0 +; GCN-HSA-NEXT: s_bfe_i64 s[50:51], s[52:53], 0x100000 +; GCN-HSA-NEXT: s_add_u32 s52, s16, 0xf0 +; GCN-HSA-NEXT: s_addc_u32 s53, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s50 +; GCN-HSA-NEXT: s_add_u32 s50, s16, 0xe0 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s51 +; GCN-HSA-NEXT: s_addc_u32 s51, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s34 +; GCN-HSA-NEXT: s_add_u32 s34, s16, 0xd0 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s35 +; GCN-HSA-NEXT: s_addc_u32 s35, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s34 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s35 +; GCN-HSA-NEXT: s_add_u32 s34, s16, 0xc0 +; GCN-HSA-NEXT: s_addc_u32 s35, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s34 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s52 +; GCN-HSA-NEXT: v_mov_b32_e32 v29, s35 +; GCN-HSA-NEXT: s_add_u32 s34, s16, 0xb0 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s53 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s67 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s65 +; GCN-HSA-NEXT: s_addc_u32 s35, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s50 ; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s74 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s72 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s30 ; GCN-HSA-NEXT: s_add_u32 s30, s16, 0xa0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s75 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s73 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s51 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s31 ; GCN-HSA-NEXT: s_addc_u32 s31, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s48 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s63 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s26 ; GCN-HSA-NEXT: s_add_u32 s26, s16, 0x90 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s27 @@ -8564,38 +8611,36 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: v_mov_b32_e32 v24, s26 ; GCN-HSA-NEXT: v_mov_b32_e32 v25, s27 ; GCN-HSA-NEXT: s_add_u32 s26, s16, 0x80 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s49 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s61 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s59 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s61 ; GCN-HSA-NEXT: s_addc_u32 s27, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s72 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s70 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s24 ; GCN-HSA-NEXT: s_add_u32 s24, s16, 0x70 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s73 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s50 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s51 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s71 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s48 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s49 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s25 ; GCN-HSA-NEXT: s_addc_u32 s25, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v30, s46 +; GCN-HSA-NEXT: v_mov_b32_e32 v30, s34 ; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s52 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s36 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s14 ; GCN-HSA-NEXT: s_add_u32 s14, s16, 0x60 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s53 -; GCN-HSA-NEXT: v_mov_b32_e32 v31, s47 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s57 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s45 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s37 +; GCN-HSA-NEXT: v_mov_b32_e32 v31, s35 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s59 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s57 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s30 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s15 ; GCN-HSA-NEXT: s_addc_u32 s15, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s70 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s71 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s34 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s35 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s68 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s69 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s38 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s39 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s31 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s44 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s43 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s55 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s47 ; GCN-HSA-NEXT: v_mov_b32_e32 v26, s26 ; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] ; GCN-HSA-NEXT: v_mov_b32_e32 v27, s27 @@ -8605,8 +8650,8 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s29 ; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[20:23] ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s22 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s42 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s41 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s46 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s45 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s23 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s25 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s14 @@ -8619,8 +8664,8 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s13 ; GCN-HSA-NEXT: s_addc_u32 s13, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s40 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s39 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s44 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s43 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s13 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_nop 0 @@ -8639,8 +8684,8 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s9 ; GCN-HSA-NEXT: s_addc_u32 s9, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s38 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s37 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s42 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s41 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_nop 0 @@ -8659,7 +8704,7 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s36 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s40 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s33 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -8675,151 +8720,158 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-NOHSA-VI-LABEL: constant_sextload_v32i16_to_v32i64: ; GCN-NOHSA-VI: ; %bb.0: ; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x24 +; GCN-NOHSA-VI-NEXT: ; implicit-def: $sgpr24_sgpr25 +; GCN-NOHSA-VI-NEXT: ; implicit-def: $sgpr26_sgpr27 +; GCN-NOHSA-VI-NEXT: ; implicit-def: $sgpr42_sgpr43 +; GCN-NOHSA-VI-NEXT: ; implicit-def: $sgpr50_sgpr51 +; GCN-NOHSA-VI-NEXT: ; implicit-def: $sgpr56_sgpr57 +; GCN-NOHSA-VI-NEXT: ; implicit-def: $sgpr62_sgpr63 +; GCN-NOHSA-VI-NEXT: ; implicit-def: $sgpr68_sgpr69 +; GCN-NOHSA-VI-NEXT: ; implicit-def: $sgpr70_sgpr71 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s22, s0, 16 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s26, s1 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s28, s1, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s30, s2, 16 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s34, s3 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s36, s3, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s28, s0, 16 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s24, s1 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s30, s1, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s36, s2, 16 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s26, s3 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s38, s3, 16 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s40, s4, 16 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s42, s5 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s44, s5, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s46, s6, 16 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s48, s7 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s50, s7, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s52, s8, 16 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s54, s9 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s56, s9, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s58, s10, 16 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s60, s11 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s62, s11, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s48, s6, 16 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s50, s7 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s52, s7, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s54, s8, 16 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s56, s9 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s58, s9, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s60, s10, 16 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s62, s11 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s64, s11, 16 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s66, s12, 16 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s68, s13 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s70, s13, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s74, s14, 16 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s76, s15 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s70, s15 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s72, s13, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s76, s14, 16 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s78, s15, 16 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[18:19], s[0:1], 0x100000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x100000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x100000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[20:21], s[6:7], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[24:25], s[8:9], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[38:39], s[10:11], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[64:65], s[12:13], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[72:73], s[14:15], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[22:23], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[26:27], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[28:29], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[30:31], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[34:35], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[36:37], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[22:23], s[40:41], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[22:23], s[8:9], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[34:35], s[10:11], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[46:47], s[12:13], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[74:75], s[14:15], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[28:29], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[24:25], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[30:31], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[36:37], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[26:27], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[38:39], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[24:25], s[40:41], 0x100000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[28:29], s[42:43], 0x100000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[26:27], s[44:45], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[30:31], s[46:47], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[36:37], s[48:49], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[34:35], s[50:51], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[40:41], s[52:53], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[44:45], s[54:55], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[42:43], s[56:57], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[46:47], s[58:59], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[30:31], s[48:49], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[38:39], s[50:51], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[36:37], s[52:53], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[40:41], s[54:55], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[44:45], s[56:57], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[42:43], s[58:59], 0x100000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[48:49], s[60:61], 0x100000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[50:51], s[62:63], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[52:53], s[66:67], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[54:55], s[68:69], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[56:57], s[70:71], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[58:59], s[74:75], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[52:53], s[64:65], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[54:55], s[66:67], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[56:57], s[68:69], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[58:59], s[72:73], 0x100000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[60:61], s[76:77], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[62:63], s[78:79], 0x100000 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s60 -; GCN-NOHSA-VI-NEXT: s_add_u32 s60, s16, 0xf0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s61 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[62:63], s[70:71], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[64:65], s[78:79], 0x100000 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s62 +; GCN-NOHSA-VI-NEXT: s_add_u32 s62, s16, 0xf0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s63 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s63, s17, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s62 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s64 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s65 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s63 +; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s60 +; GCN-NOHSA-VI-NEXT: s_add_u32 s60, s16, 0xe0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s61 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s61, s17, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s60 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s62 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s63 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s74 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s75 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s61 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s56 +; GCN-NOHSA-VI-NEXT: s_add_u32 s56, s16, 0xd0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s57 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s57, s17, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s56 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s58 -; GCN-NOHSA-VI-NEXT: s_add_u32 s58, s16, 0xe0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s59 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s59, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s58 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s72 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s73 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s59 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s57 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s54 -; GCN-NOHSA-VI-NEXT: s_add_u32 s54, s16, 0xd0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s55 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s55, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s54 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s56 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s57 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s55 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s46 +; GCN-NOHSA-VI-NEXT: s_add_u32 s46, s16, 0xc0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s47 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s47, s17, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s46 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s54 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s55 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s47 +; GCN-NOHSA-VI-NEXT: s_add_u32 s46, s16, 0xb0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s47, s17, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s46 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s50 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s51 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s52 -; GCN-NOHSA-VI-NEXT: s_add_u32 s52, s16, 0xc0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s53 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s53, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s52 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s64 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s65 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s53 -; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s48 -; GCN-NOHSA-VI-NEXT: s_add_u32 s48, s16, 0xb0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s49 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s49, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s48 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s50 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s51 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s49 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s47 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s38 -; GCN-NOHSA-VI-NEXT: s_add_u32 s38, s16, 0xa0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s39 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s39, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s38 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s46 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s47 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s39 -; GCN-NOHSA-VI-NEXT: s_add_u32 s38, s16, 0x90 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s34 +; GCN-NOHSA-VI-NEXT: s_add_u32 s34, s16, 0xa0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s35 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s35, s17, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s34 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s48 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s49 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s35 +; GCN-NOHSA-VI-NEXT: s_add_u32 s34, s16, 0x90 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s39, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s38 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s35, s17, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s34 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s44 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s45 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s42 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s43 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s39 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s35 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s24 -; GCN-NOHSA-VI-NEXT: s_add_u32 s24, s16, 0x80 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s25 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s25, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s24 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s22 +; GCN-NOHSA-VI-NEXT: s_add_u32 s22, s16, 0x80 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s23 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s23, s17, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s22 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s40 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s41 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s25 -; GCN-NOHSA-VI-NEXT: s_add_u32 s24, s16, 0x70 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s23 +; GCN-NOHSA-VI-NEXT: s_add_u32 s22, s16, 0x70 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s25, s17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s24 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s36 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s37 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s34 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s35 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s25 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s23, s17, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s22 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s38 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s39 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s36 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s37 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s23 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: s_nop 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s20 @@ -8848,8 +8900,8 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s16, 48 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s22 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s23 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s24 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s25 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s17, 0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3 @@ -9056,92 +9108,100 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GFX12-LABEL: constant_sextload_v32i16_to_v32i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[16:19], s[4:5], 0x24 +; GFX12-NEXT: ; implicit-def: $sgpr22_sgpr23 +; GFX12-NEXT: ; implicit-def: $sgpr24_sgpr25 +; GFX12-NEXT: ; implicit-def: $sgpr28_sgpr29 +; GFX12-NEXT: ; implicit-def: $sgpr38_sgpr39 +; GFX12-NEXT: ; implicit-def: $sgpr66_sgpr67 +; GFX12-NEXT: ; implicit-def: $sgpr48_sgpr49 +; GFX12-NEXT: ; implicit-def: $sgpr46_sgpr47 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshr_b32 s28, s2, 16 -; GFX12-NEXT: s_lshr_b32 s42, s5, 16 -; GFX12-NEXT: s_lshr_b32 s50, s8, 16 -; GFX12-NEXT: s_mov_b32 s60, s11 -; GFX12-NEXT: s_lshr_b32 s22, s0, 16 -; GFX12-NEXT: s_mov_b32 s24, s1 -; GFX12-NEXT: s_lshr_b32 s26, s1, 16 -; GFX12-NEXT: s_mov_b32 s30, s3 -; GFX12-NEXT: s_lshr_b32 s36, s3, 16 -; GFX12-NEXT: s_lshr_b32 s38, s4, 16 -; GFX12-NEXT: s_mov_b32 s40, s5 -; GFX12-NEXT: s_lshr_b32 s44, s6, 16 -; GFX12-NEXT: s_mov_b32 s46, s7 -; GFX12-NEXT: s_lshr_b32 s48, s7, 16 -; GFX12-NEXT: s_mov_b32 s52, s9 -; GFX12-NEXT: s_lshr_b32 s54, s9, 16 -; GFX12-NEXT: s_bfe_i64 s[56:57], s[10:11], 0x100000 -; GFX12-NEXT: s_lshr_b32 s58, s10, 16 -; GFX12-NEXT: s_lshr_b32 s62, s11, 16 -; GFX12-NEXT: s_bfe_i64 s[10:11], s[28:29], 0x100000 -; GFX12-NEXT: s_bfe_i64 s[28:29], s[42:43], 0x100000 -; GFX12-NEXT: s_bfe_i64 s[42:43], s[50:51], 0x100000 -; GFX12-NEXT: s_bfe_i64 s[50:51], s[60:61], 0x100000 -; GFX12-NEXT: s_lshr_b32 s60, s14, 16 -; GFX12-NEXT: s_bfe_i64 s[64:65], s[14:15], 0x100000 -; GFX12-NEXT: s_mov_b32 s14, s15 -; GFX12-NEXT: s_lshr_b32 s66, s15, 16 +; GFX12-NEXT: s_lshr_b32 s30, s0, 16 +; GFX12-NEXT: s_mov_b32 s22, s1 +; GFX12-NEXT: s_mov_b32 s24, s3 +; GFX12-NEXT: s_lshr_b32 s50, s4, 16 +; GFX12-NEXT: s_mov_b32 s28, s5 +; GFX12-NEXT: s_lshr_b32 s52, s5, 16 +; GFX12-NEXT: s_lshr_b32 s60, s9, 16 +; GFX12-NEXT: s_lshr_b32 s62, s10, 16 +; GFX12-NEXT: s_lshr_b32 s42, s3, 16 +; GFX12-NEXT: s_lshr_b32 s58, s8, 16 ; GFX12-NEXT: s_bfe_i64 s[18:19], s[0:1], 0x100000 +; GFX12-NEXT: s_lshr_b32 s34, s1, 16 ; GFX12-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x100000 -; GFX12-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x100000 -; GFX12-NEXT: s_bfe_i64 s[20:21], s[6:7], 0x100000 -; GFX12-NEXT: s_bfe_i64 s[34:35], s[8:9], 0x100000 -; GFX12-NEXT: s_bfe_i64 s[4:5], s[22:23], 0x100000 -; GFX12-NEXT: s_bfe_i64 s[8:9], s[24:25], 0x100000 -; GFX12-NEXT: s_bfe_i64 s[6:7], s[26:27], 0x100000 -; GFX12-NEXT: s_bfe_i64 s[24:25], s[30:31], 0x100000 -; GFX12-NEXT: s_bfe_i64 s[22:23], s[36:37], 0x100000 -; GFX12-NEXT: s_bfe_i64 s[26:27], s[38:39], 0x100000 -; GFX12-NEXT: s_bfe_i64 s[30:31], s[40:41], 0x100000 -; GFX12-NEXT: s_bfe_i64 s[36:37], s[44:45], 0x100000 -; GFX12-NEXT: s_bfe_i64 s[40:41], s[46:47], 0x100000 -; GFX12-NEXT: s_bfe_i64 s[38:39], s[48:49], 0x100000 -; GFX12-NEXT: s_bfe_i64 s[44:45], s[52:53], 0x100000 -; GFX12-NEXT: s_bfe_i64 s[46:47], s[54:55], 0x100000 -; GFX12-NEXT: s_bfe_i64 s[48:49], s[58:59], 0x100000 -; GFX12-NEXT: s_lshr_b32 s52, s12, 16 -; GFX12-NEXT: s_bfe_i64 s[54:55], s[12:13], 0x100000 -; GFX12-NEXT: s_mov_b32 s12, s13 -; GFX12-NEXT: s_lshr_b32 s58, s13, 16 -; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x100000 +; GFX12-NEXT: s_lshr_b32 s40, s2, 16 +; GFX12-NEXT: s_bfe_i64 s[26:27], s[6:7], 0x100000 +; GFX12-NEXT: s_lshr_b32 s54, s6, 16 +; GFX12-NEXT: s_mov_b32 s38, s7 +; GFX12-NEXT: s_lshr_b32 s56, s7, 16 +; GFX12-NEXT: s_bfe_i64 s[2:3], s[30:31], 0x100000 +; GFX12-NEXT: s_bfe_i64 s[6:7], s[22:23], 0x100000 +; GFX12-NEXT: s_bfe_i64 s[22:23], s[24:25], 0x100000 +; GFX12-NEXT: s_bfe_i64 s[24:25], s[50:51], 0x100000 +; GFX12-NEXT: s_bfe_i64 s[30:31], s[28:29], 0x100000 +; GFX12-NEXT: s_bfe_i64 s[28:29], s[52:53], 0x100000 +; GFX12-NEXT: s_bfe_i64 s[50:51], s[60:61], 0x100000 +; GFX12-NEXT: s_bfe_i64 s[52:53], s[62:63], 0x100000 +; GFX12-NEXT: s_lshr_b32 s60, s14, 16 +; GFX12-NEXT: s_bfe_i64 s[62:63], s[14:15], 0x100000 +; GFX12-NEXT: s_mov_b32 s66, s15 +; GFX12-NEXT: s_lshr_b32 s14, s15, 16 +; GFX12-NEXT: s_bfe_i64 s[44:45], s[10:11], 0x100000 +; GFX12-NEXT: s_mov_b32 s48, s11 +; GFX12-NEXT: s_lshr_b32 s64, s11, 16 +; GFX12-NEXT: s_bfe_i64 s[10:11], s[42:43], 0x100000 +; GFX12-NEXT: s_bfe_i64 s[42:43], s[58:59], 0x100000 +; GFX12-NEXT: ; implicit-def: $sgpr58_sgpr59 +; GFX12-NEXT: s_bfe_i64 s[20:21], s[4:5], 0x100000 +; GFX12-NEXT: s_bfe_i64 s[36:37], s[8:9], 0x100000 +; GFX12-NEXT: s_mov_b32 s46, s9 +; GFX12-NEXT: s_bfe_i64 s[4:5], s[34:35], 0x100000 +; GFX12-NEXT: s_bfe_i64 s[8:9], s[40:41], 0x100000 +; GFX12-NEXT: s_bfe_i64 s[34:35], s[54:55], 0x100000 +; GFX12-NEXT: s_bfe_i64 s[40:41], s[38:39], 0x100000 +; GFX12-NEXT: s_bfe_i64 s[38:39], s[56:57], 0x100000 +; GFX12-NEXT: s_lshr_b32 s54, s12, 16 +; GFX12-NEXT: s_bfe_i64 s[56:57], s[12:13], 0x100000 +; GFX12-NEXT: s_mov_b32 s58, s13 +; GFX12-NEXT: s_lshr_b32 s12, s13, 16 ; GFX12-NEXT: s_bfe_i64 s[66:67], s[66:67], 0x100000 +; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x100000 ; GFX12-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x100000 -; GFX12-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s15 -; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x100000 +; GFX12-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s67 ; GFX12-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x100000 -; GFX12-NEXT: v_dual_mov_b32 v0, s14 :: v_dual_mov_b32 v3, s67 -; GFX12-NEXT: v_dual_mov_b32 v2, s66 :: v_dual_mov_b32 v5, s65 -; GFX12-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x100000 -; GFX12-NEXT: v_dual_mov_b32 v4, s64 :: v_dual_mov_b32 v7, s61 -; GFX12-NEXT: v_dual_mov_b32 v6, s60 :: v_dual_mov_b32 v9, s13 -; GFX12-NEXT: v_dual_mov_b32 v8, s12 :: v_dual_mov_b32 v11, s59 -; GFX12-NEXT: v_dual_mov_b32 v10, s58 :: v_dual_mov_b32 v13, s55 -; GFX12-NEXT: v_dual_mov_b32 v12, s54 :: v_dual_mov_b32 v15, s53 -; GFX12-NEXT: v_mov_b32_e32 v14, s52 -; GFX12-NEXT: s_bfe_i64 s[12:13], s[62:63], 0x100000 +; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x100000 +; GFX12-NEXT: v_dual_mov_b32 v0, s66 :: v_dual_mov_b32 v3, s15 +; GFX12-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v5, s63 +; GFX12-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x100000 +; GFX12-NEXT: v_dual_mov_b32 v4, s62 :: v_dual_mov_b32 v7, s61 +; GFX12-NEXT: v_dual_mov_b32 v6, s60 :: v_dual_mov_b32 v9, s59 +; GFX12-NEXT: v_dual_mov_b32 v8, s58 :: v_dual_mov_b32 v11, s13 +; GFX12-NEXT: v_dual_mov_b32 v10, s12 :: v_dual_mov_b32 v13, s57 +; GFX12-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x100000 +; GFX12-NEXT: v_dual_mov_b32 v12, s56 :: v_dual_mov_b32 v15, s55 +; GFX12-NEXT: v_mov_b32_e32 v14, s54 +; GFX12-NEXT: s_bfe_i64 s[12:13], s[64:65], 0x100000 +; GFX12-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x100000 ; GFX12-NEXT: s_clause 0x3 ; GFX12-NEXT: global_store_b128 v24, v[0:3], s[16:17] offset:240 ; GFX12-NEXT: global_store_b128 v24, v[4:7], s[16:17] offset:224 ; GFX12-NEXT: global_store_b128 v24, v[8:11], s[16:17] offset:208 ; GFX12-NEXT: global_store_b128 v24, v[12:15], s[16:17] offset:192 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v0, s50 :: v_dual_mov_b32 v3, s13 -; GFX12-NEXT: v_dual_mov_b32 v1, s51 :: v_dual_mov_b32 v2, s12 -; GFX12-NEXT: v_dual_mov_b32 v5, s57 :: v_dual_mov_b32 v4, s56 -; GFX12-NEXT: v_dual_mov_b32 v7, s49 :: v_dual_mov_b32 v6, s48 -; GFX12-NEXT: v_dual_mov_b32 v9, s45 :: v_dual_mov_b32 v8, s44 -; GFX12-NEXT: v_dual_mov_b32 v11, s47 :: v_dual_mov_b32 v10, s46 -; GFX12-NEXT: v_dual_mov_b32 v13, s35 :: v_dual_mov_b32 v12, s34 +; GFX12-NEXT: v_dual_mov_b32 v0, s48 :: v_dual_mov_b32 v3, s13 +; GFX12-NEXT: v_dual_mov_b32 v1, s49 :: v_dual_mov_b32 v2, s12 +; GFX12-NEXT: v_dual_mov_b32 v5, s45 :: v_dual_mov_b32 v4, s44 +; GFX12-NEXT: v_dual_mov_b32 v7, s53 :: v_dual_mov_b32 v6, s52 +; GFX12-NEXT: v_dual_mov_b32 v9, s47 :: v_dual_mov_b32 v8, s46 +; GFX12-NEXT: v_dual_mov_b32 v11, s51 :: v_dual_mov_b32 v10, s50 +; GFX12-NEXT: v_dual_mov_b32 v13, s37 :: v_dual_mov_b32 v12, s36 ; GFX12-NEXT: v_dual_mov_b32 v15, s43 :: v_dual_mov_b32 v14, s42 ; GFX12-NEXT: v_dual_mov_b32 v17, s41 :: v_dual_mov_b32 v16, s40 ; GFX12-NEXT: v_dual_mov_b32 v19, s39 :: v_dual_mov_b32 v18, s38 -; GFX12-NEXT: v_dual_mov_b32 v21, s21 :: v_dual_mov_b32 v20, s20 -; GFX12-NEXT: v_dual_mov_b32 v23, s37 :: v_dual_mov_b32 v22, s36 +; GFX12-NEXT: v_dual_mov_b32 v21, s27 :: v_dual_mov_b32 v20, s26 +; GFX12-NEXT: v_dual_mov_b32 v23, s35 :: v_dual_mov_b32 v22, s34 ; GFX12-NEXT: s_clause 0x5 ; GFX12-NEXT: global_store_b128 v24, v[0:3], s[16:17] offset:176 ; GFX12-NEXT: global_store_b128 v24, v[4:7], s[16:17] offset:160 @@ -9151,16 +9211,16 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GFX12-NEXT: global_store_b128 v24, v[20:23], s[16:17] offset:96 ; GFX12-NEXT: v_dual_mov_b32 v0, s30 :: v_dual_mov_b32 v3, s29 ; GFX12-NEXT: v_dual_mov_b32 v1, s31 :: v_dual_mov_b32 v2, s28 -; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 -; GFX12-NEXT: v_dual_mov_b32 v7, s27 :: v_dual_mov_b32 v6, s26 -; GFX12-NEXT: v_dual_mov_b32 v9, s25 :: v_dual_mov_b32 v8, s24 -; GFX12-NEXT: v_dual_mov_b32 v11, s23 :: v_dual_mov_b32 v10, s22 +; GFX12-NEXT: v_dual_mov_b32 v5, s21 :: v_dual_mov_b32 v4, s20 +; GFX12-NEXT: v_dual_mov_b32 v7, s25 :: v_dual_mov_b32 v6, s24 +; GFX12-NEXT: v_dual_mov_b32 v9, s23 :: v_dual_mov_b32 v8, s22 +; GFX12-NEXT: v_dual_mov_b32 v11, s11 :: v_dual_mov_b32 v10, s10 ; GFX12-NEXT: v_dual_mov_b32 v13, s1 :: v_dual_mov_b32 v12, s0 -; GFX12-NEXT: v_dual_mov_b32 v15, s11 :: v_dual_mov_b32 v14, s10 -; GFX12-NEXT: v_dual_mov_b32 v17, s9 :: v_dual_mov_b32 v16, s8 -; GFX12-NEXT: v_dual_mov_b32 v19, s7 :: v_dual_mov_b32 v18, s6 +; GFX12-NEXT: v_dual_mov_b32 v15, s9 :: v_dual_mov_b32 v14, s8 +; GFX12-NEXT: v_dual_mov_b32 v17, s7 :: v_dual_mov_b32 v16, s6 +; GFX12-NEXT: v_dual_mov_b32 v19, s5 :: v_dual_mov_b32 v18, s4 ; GFX12-NEXT: v_dual_mov_b32 v21, s19 :: v_dual_mov_b32 v20, s18 -; GFX12-NEXT: v_dual_mov_b32 v23, s5 :: v_dual_mov_b32 v22, s4 +; GFX12-NEXT: v_dual_mov_b32 v23, s3 :: v_dual_mov_b32 v22, s2 ; GFX12-NEXT: s_clause 0x5 ; GFX12-NEXT: global_store_b128 v24, v[0:3], s[16:17] offset:80 ; GFX12-NEXT: global_store_b128 v24, v[4:7], s[16:17] offset:64 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll index b534c2c267fad..c5213336f818c 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll @@ -6396,39 +6396,40 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; GFX6-NOHSA-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1 +; GFX6-NOHSA-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NOHSA-NEXT: s_lshr_b32 s6, s5, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s8, s4, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s10, s4, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s4, 8 -; GFX6-NOHSA-NEXT: s_lshr_b32 s14, s5, 8 -; GFX6-NOHSA-NEXT: s_mov_b32 s16, s5 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 +; GFX6-NOHSA-NEXT: s_lshr_b32 s8, s5, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s10, s4, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s4, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s14, s4, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s5, 8 +; GFX6-NOHSA-NEXT: s_mov_b32 s6, s5 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[18:19], s[4:5], 0x80000 -; GFX6-NOHSA-NEXT: s_ashr_i32 s15, s5, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s17, s5, 31 ; GFX6-NOHSA-NEXT: s_ashr_i32 s20, s5, 24 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[14:15], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[16:17], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s15 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s17 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s18 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s19 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s16 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s17 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s6 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s7 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s8 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s9 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s10 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s11 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s10 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s11 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s12 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s13 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s12 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s13 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s14 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s15 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s4 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s5 @@ -6439,26 +6440,27 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-HSA-NEXT: s_lshr_b32 s4, s3, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s6, s2, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s8, s2, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s10, s2, 8 -; GFX7-HSA-NEXT: s_lshr_b32 s12, s3, 8 -; GFX7-HSA-NEXT: s_mov_b32 s14, s3 -; GFX7-HSA-NEXT: s_ashr_i32 s5, s3, 31 +; GFX7-HSA-NEXT: s_lshr_b32 s6, s3, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s8, s2, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s10, s2, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s12, s2, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s14, s3, 8 +; GFX7-HSA-NEXT: s_mov_b32 s4, s3 +; GFX7-HSA-NEXT: s_ashr_i32 s7, s3, 31 ; GFX7-HSA-NEXT: s_bfe_i64 s[16:17], s[2:3], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 ; GFX7-HSA-NEXT: s_ashr_i32 s18, s3, 24 -; GFX7-HSA-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[2:3], s[6:7], 0x80000 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 48 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -6467,14 +6469,14 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s7 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s8 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s10 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s11 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 @@ -6482,15 +6484,15 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; GFX7-HSA-NEXT: s_add_u32 s0, s0, 32 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s17 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s11 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s12 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s13 ; GFX7-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s15 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s12 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s13 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s14 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s15 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_endpgm @@ -6498,24 +6500,25 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; GFX8-NOHSA-LABEL: constant_sextload_v8i8_to_v8i64: ; GFX8-NOHSA: ; %bb.0: ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NOHSA-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NOHSA-NEXT: s_lshr_b32 s4, s3, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s6, s2, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s2, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s10, s2, 8 -; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s3, 8 -; GFX8-NOHSA-NEXT: s_mov_b32 s14, s3 -; GFX8-NOHSA-NEXT: s_ashr_i32 s5, s3, 31 +; GFX8-NOHSA-NEXT: s_lshr_b32 s6, s3, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s2, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s10, s2, 24 +; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s2, 8 +; GFX8-NOHSA-NEXT: s_lshr_b32 s14, s3, 8 +; GFX8-NOHSA-NEXT: s_mov_b32 s4, s3 +; GFX8-NOHSA-NEXT: s_ashr_i32 s7, s3, 31 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[16:17], s[2:3], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 ; GFX8-NOHSA-NEXT: s_ashr_i32 s18, s3, 24 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[2:3], s[6:7], 0x80000 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 48 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 @@ -6524,14 +6527,14 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s18 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s7 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s9 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s10 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s11 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 @@ -6539,15 +6542,15 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; GFX8-NOHSA-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s17 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s10 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s11 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s12 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s13 ; GFX8-NOHSA-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s14 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s15 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s12 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s13 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s14 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s15 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_endpgm @@ -6611,33 +6614,34 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; GFX12-LABEL: constant_sextload_v8i8_to_v8i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshr_b32 s4, s3, 16 -; GFX12-NEXT: s_lshr_b32 s6, s2, 16 -; GFX12-NEXT: s_lshr_b32 s8, s2, 24 -; GFX12-NEXT: s_lshr_b32 s10, s2, 8 +; GFX12-NEXT: s_lshr_b32 s6, s3, 16 +; GFX12-NEXT: s_lshr_b32 s8, s2, 16 +; GFX12-NEXT: s_lshr_b32 s10, s2, 24 +; GFX12-NEXT: s_lshr_b32 s12, s2, 8 +; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 ; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 ; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 -; GFX12-NEXT: s_lshr_b32 s12, s3, 8 -; GFX12-NEXT: s_mov_b32 s14, s3 +; GFX12-NEXT: s_lshr_b32 s14, s3, 8 +; GFX12-NEXT: s_mov_b32 s4, s3 ; GFX12-NEXT: s_bfe_i64 s[16:17], s[2:3], 0x80000 ; GFX12-NEXT: s_ashr_i32 s15, s3, 31 ; GFX12-NEXT: s_ashr_i32 s18, s3, 24 -; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v3, s15 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v9, s7 -; GFX12-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v11, s9 -; GFX12-NEXT: v_dual_mov_b32 v10, s8 :: v_dual_mov_b32 v7, s11 -; GFX12-NEXT: s_bfe_i64 s[2:3], s[14:15], 0x80000 ; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v3, s15 +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v9, s9 +; GFX12-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v11, s11 +; GFX12-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v7, s13 +; GFX12-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[4:5], s[14:15], 0x80000 ; GFX12-NEXT: v_dual_mov_b32 v2, s18 :: v_dual_mov_b32 v5, s17 -; GFX12-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v6, s10 :: v_dual_mov_b32 v13, s3 -; GFX12-NEXT: v_dual_mov_b32 v12, s2 :: v_dual_mov_b32 v15, s13 -; GFX12-NEXT: v_mov_b32_e32 v14, s12 +; GFX12-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: v_dual_mov_b32 v6, s12 :: v_dual_mov_b32 v13, s3 +; GFX12-NEXT: v_dual_mov_b32 v12, s2 :: v_dual_mov_b32 v15, s5 +; GFX12-NEXT: v_mov_b32_e32 v14, s4 ; GFX12-NEXT: s_clause 0x3 ; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16 ; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1] @@ -7032,71 +7036,73 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1 +; GFX6-NOHSA-NEXT: ; implicit-def: $sgpr10_sgpr11 +; GFX6-NOHSA-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s6, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s14, s6, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s6, 8 -; GFX6-NOHSA-NEXT: s_lshr_b32 s18, s4, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s20, s4, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s22, s4, 8 -; GFX6-NOHSA-NEXT: s_lshr_b32 s24, s7, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s10, s7, 8 -; GFX6-NOHSA-NEXT: s_mov_b32 s26, s7 +; GFX6-NOHSA-NEXT: s_lshr_b32 s14, s6, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s6, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s18, s6, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s20, s4, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s22, s4, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s24, s4, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s7, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s26, s7, 8 +; GFX6-NOHSA-NEXT: s_mov_b32 s10, s7 ; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s5, 16 ; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s5, 8 ; GFX6-NOHSA-NEXT: s_mov_b32 s8, s5 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[34:35], s[4:5], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[36:37], s[6:7], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 -; GFX6-NOHSA-NEXT: s_ashr_i32 s29, s5, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s31, s5, 24 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 -; GFX6-NOHSA-NEXT: s_ashr_i32 s33, s7, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s38, s7, 24 +; GFX6-NOHSA-NEXT: s_ashr_i32 s27, s5, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s29, s5, 24 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[38:39], s[10:11], 0x80000 +; GFX6-NOHSA-NEXT: s_ashr_i32 s31, s7, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s33, s7, 24 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[30:31], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[28:29], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[10:11], s[26:27], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s36 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s37 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s34 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s35 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s38 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s33 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s26 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s27 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s12 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s13 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s14 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s15 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s33 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s31 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s38 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s39 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s14 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s15 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s16 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s17 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:80 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s31 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s29 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s16 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s17 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s29 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s27 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s18 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s19 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s18 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s19 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s21 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s20 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s21 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s22 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s23 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s8 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s9 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s22 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s23 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s24 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s25 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s12 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s13 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:112 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s10 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s11 @@ -7115,44 +7121,46 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 ; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-HSA-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GFX7-HSA-NEXT: ; implicit-def: $sgpr2_sgpr3 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-HSA-NEXT: s_lshr_b32 s2, s6, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s8, s6, 24 -; GFX7-HSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000 -; GFX7-HSA-NEXT: s_lshr_b32 s10, s6, 8 -; GFX7-HSA-NEXT: s_lshr_b32 s12, s4, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s14, s4, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s16, s4, 8 -; GFX7-HSA-NEXT: s_lshr_b32 s18, s7, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s20, s7, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s10, s6, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s12, s6, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s16, s4, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s18, s4, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s20, s4, 8 +; GFX7-HSA-NEXT: s_mov_b32 s2, s5 ; GFX7-HSA-NEXT: s_ashr_i32 s27, s5, 31 ; GFX7-HSA-NEXT: s_ashr_i32 s29, s5, 24 -; GFX7-HSA-NEXT: s_mov_b32 s22, s7 -; GFX7-HSA-NEXT: s_lshr_b32 s24, s5, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s26, s5, 8 -; GFX7-HSA-NEXT: s_mov_b32 s28, s5 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-HSA-NEXT: s_bfe_i64 s[2:3], s[8:9], 0x80000 +; GFX7-HSA-NEXT: s_lshr_b32 s26, s5, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s28, s5, 8 +; GFX7-HSA-NEXT: s_bfe_i64 s[30:31], s[4:5], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[4:5], s[10:11], 0x80000 +; GFX7-HSA-NEXT: s_lshr_b32 s14, s6, 8 +; GFX7-HSA-NEXT: s_mov_b32 s8, s7 +; GFX7-HSA-NEXT: s_lshr_b32 s22, s7, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s24, s7, 8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-HSA-NEXT: s_bfe_i64 s[4:5], s[12:13], 0x80000 ; GFX7-HSA-NEXT: s_ashr_i32 s33, s7, 31 ; GFX7-HSA-NEXT: s_ashr_i32 s34, s7, 24 -; GFX7-HSA-NEXT: s_bfe_i64 s[30:31], s[4:5], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-HSA-NEXT: s_bfe_i64 s[4:5], s[28:29], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[2:3], s[26:27], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[8:9], s[24:25], 0x80000 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-HSA-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[2:3], s[28:29], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[10:11], s[26:27], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[12:13], s[24:25], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 ; GFX7-HSA-NEXT: s_add_u32 s24, s0, 0x50 ; GFX7-HSA-NEXT: s_addc_u32 s25, s1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 @@ -7164,30 +7172,30 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s25 ; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s6 ; GFX7-HSA-NEXT: s_add_u32 s6, s0, 16 -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s11 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s14 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s15 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GFX7-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s12 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s13 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s14 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s15 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s16 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s17 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s18 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s19 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s30 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s31 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s16 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s17 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s21 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0x70 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s18 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s22 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s23 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s34 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s33 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 @@ -7197,16 +7205,16 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GFX7-HSA-NEXT: s_add_u32 s6, s0, 48 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s22 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s23 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s21 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s12 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s13 ; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-HSA-NEXT: s_add_u32 s0, s0, 32 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s10 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s11 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s29 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s27 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 @@ -7224,110 +7232,111 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX8-NOHSA-LABEL: constant_sextload_v16i8_to_v16i64: ; GFX8-NOHSA: ; %bb.0: ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NOHSA-NEXT: ; implicit-def: $sgpr10_sgpr11 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GFX8-NOHSA-NEXT: ; implicit-def: $sgpr2_sgpr3 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s10, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s14, s10, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s16, s10, 8 -; GFX8-NOHSA-NEXT: s_lshr_b32 s18, s8, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s20, s8, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s22, s8, 8 -; GFX8-NOHSA-NEXT: s_lshr_b32 s24, s11, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s26, s11, 8 -; GFX8-NOHSA-NEXT: s_mov_b32 s28, s11 -; GFX8-NOHSA-NEXT: s_lshr_b32 s6, s9, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s4, s9, 8 -; GFX8-NOHSA-NEXT: s_mov_b32 s2, s9 -; GFX8-NOHSA-NEXT: s_ashr_i32 s25, s9, 31 -; GFX8-NOHSA-NEXT: s_ashr_i32 s29, s9, 24 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[30:31], s[8:9], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[34:35], s[10:11], 0x80000 +; GFX8-NOHSA-NEXT: s_lshr_b32 s14, s6, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s16, s6, 24 +; GFX8-NOHSA-NEXT: s_lshr_b32 s18, s6, 8 +; GFX8-NOHSA-NEXT: s_lshr_b32 s20, s4, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s22, s4, 24 +; GFX8-NOHSA-NEXT: s_lshr_b32 s24, s4, 8 +; GFX8-NOHSA-NEXT: s_lshr_b32 s26, s7, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s7, 8 +; GFX8-NOHSA-NEXT: s_mov_b32 s10, s7 +; GFX8-NOHSA-NEXT: s_lshr_b32 s28, s5, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s5, 8 +; GFX8-NOHSA-NEXT: s_mov_b32 s2, s5 +; GFX8-NOHSA-NEXT: s_ashr_i32 s27, s5, 31 +; GFX8-NOHSA-NEXT: s_ashr_i32 s29, s5, 24 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[30:31], s[4:5], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[34:35], s[6:7], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[8:9], s[28:29], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 -; GFX8-NOHSA-NEXT: s_ashr_i32 s28, s11, 31 -; GFX8-NOHSA-NEXT: s_ashr_i32 s33, s11, 24 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[10:11], s[24:25], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[4:5], s[28:29], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 +; GFX8-NOHSA-NEXT: s_ashr_i32 s28, s7, 31 +; GFX8-NOHSA-NEXT: s_ashr_i32 s33, s7, 24 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[6:7], s[26:27], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s12 -; GFX8-NOHSA-NEXT: s_add_u32 s12, s0, 0x50 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s13 -; GFX8-NOHSA-NEXT: s_addc_u32 s13, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s12 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s14 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s15 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s13 -; GFX8-NOHSA-NEXT: s_add_u32 s12, s0, 64 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s14 +; GFX8-NOHSA-NEXT: s_add_u32 s14, s0, 0x50 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s15 +; GFX8-NOHSA-NEXT: s_addc_u32 s15, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s14 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s16 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s17 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s15 +; GFX8-NOHSA-NEXT: s_add_u32 s14, s0, 64 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: s_addc_u32 s13, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s12 +; GFX8-NOHSA-NEXT: s_addc_u32 s15, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s14 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s34 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s35 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s16 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s17 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s13 -; GFX8-NOHSA-NEXT: s_add_u32 s12, s0, 16 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s18 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s19 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s15 +; GFX8-NOHSA-NEXT: s_add_u32 s14, s0, 16 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: s_addc_u32 s13, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s12 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s18 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s19 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s20 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s21 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s13 +; GFX8-NOHSA-NEXT: s_addc_u32 s15, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s14 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s20 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s21 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s22 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s23 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s15 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s30 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s31 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s22 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s23 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s24 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s25 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s10 -; GFX8-NOHSA-NEXT: s_add_u32 s10, s0, 0x70 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s11 -; GFX8-NOHSA-NEXT: s_addc_u32 s11, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s10 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0x70 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s33 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s28 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s11 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: s_nop 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NOHSA-NEXT: s_add_u32 s8, s0, 0x60 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s9 -; GFX8-NOHSA-NEXT: s_addc_u32 s9, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s26 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s27 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s9 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0x60 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: s_nop 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 48 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s10 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s11 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s12 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s13 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_nop 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 48 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NOHSA-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s29 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s25 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s27 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NOHSA-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s8 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s9 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_endpgm @@ -7434,57 +7443,59 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX12-LABEL: constant_sextload_v16i8_to_v16i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX12-NEXT: ; implicit-def: $sgpr2_sgpr3 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshr_b32 s2, s6, 16 -; GFX12-NEXT: s_lshr_b32 s8, s6, 24 -; GFX12-NEXT: s_lshr_b32 s10, s6, 8 +; GFX12-NEXT: s_lshr_b32 s10, s6, 16 +; GFX12-NEXT: s_lshr_b32 s12, s6, 24 +; GFX12-NEXT: s_lshr_b32 s14, s6, 8 ; GFX12-NEXT: s_bfe_i64 s[30:31], s[4:5], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000 -; GFX12-NEXT: s_lshr_b32 s12, s4, 16 -; GFX12-NEXT: s_lshr_b32 s14, s4, 24 -; GFX12-NEXT: s_bfe_i64 s[34:35], s[6:7], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 ; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s35 -; GFX12-NEXT: s_lshr_b32 s16, s4, 8 -; GFX12-NEXT: v_dual_mov_b32 v4, s30 :: v_dual_mov_b32 v9, s3 -; GFX12-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v11, s9 -; GFX12-NEXT: v_dual_mov_b32 v10, s8 :: v_dual_mov_b32 v3, s11 -; GFX12-NEXT: s_lshr_b32 s18, s7, 16 +; GFX12-NEXT: s_lshr_b32 s16, s4, 16 +; GFX12-NEXT: s_lshr_b32 s18, s4, 24 +; GFX12-NEXT: s_bfe_i64 s[34:35], s[6:7], 0x80000 ; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v0, s34 :: v_dual_mov_b32 v5, s31 -; GFX12-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v13, s13 -; GFX12-NEXT: s_lshr_b32 s20, s7, 8 -; GFX12-NEXT: s_mov_b32 s22, s7 +; GFX12-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s35 +; GFX12-NEXT: s_lshr_b32 s20, s4, 8 +; GFX12-NEXT: v_dual_mov_b32 v4, s30 :: v_dual_mov_b32 v9, s11 +; GFX12-NEXT: v_dual_mov_b32 v8, s10 :: v_dual_mov_b32 v11, s13 +; GFX12-NEXT: v_dual_mov_b32 v10, s12 :: v_dual_mov_b32 v3, s15 +; GFX12-NEXT: s_lshr_b32 s22, s7, 16 +; GFX12-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 ; GFX12-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 -; GFX12-NEXT: s_lshr_b32 s24, s5, 16 +; GFX12-NEXT: v_dual_mov_b32 v0, s34 :: v_dual_mov_b32 v5, s31 +; GFX12-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v13, s17 +; GFX12-NEXT: s_lshr_b32 s24, s7, 8 +; GFX12-NEXT: s_mov_b32 s2, s7 +; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 +; GFX12-NEXT: s_lshr_b32 s26, s5, 16 +; GFX12-NEXT: s_mov_b32 s8, s5 +; GFX12-NEXT: s_ashr_i32 s27, s5, 31 ; GFX12-NEXT: s_ashr_i32 s33, s7, 31 ; GFX12-NEXT: s_ashr_i32 s36, s7, 24 -; GFX12-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v15, s15 -; GFX12-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v7, s17 -; GFX12-NEXT: s_lshr_b32 s26, s5, 8 -; GFX12-NEXT: s_mov_b32 s28, s5 -; GFX12-NEXT: s_ashr_i32 s27, s5, 31 -; GFX12-NEXT: s_ashr_i32 s29, s5, 24 ; GFX12-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 -; GFX12-NEXT: v_mov_b32_e32 v6, s16 +; GFX12-NEXT: v_dual_mov_b32 v12, s16 :: v_dual_mov_b32 v15, s19 +; GFX12-NEXT: v_dual_mov_b32 v14, s18 :: v_dual_mov_b32 v7, s21 +; GFX12-NEXT: s_lshr_b32 s28, s5, 8 +; GFX12-NEXT: s_ashr_i32 s29, s5, 24 +; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000 ; GFX12-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 +; GFX12-NEXT: v_mov_b32_e32 v6, s20 +; GFX12-NEXT: s_bfe_i64 s[4:5], s[8:9], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[8:9], s[26:27], 0x80000 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:80 ; GFX12-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:64 -; GFX12-NEXT: v_dual_mov_b32 v0, s18 :: v_dual_mov_b32 v3, s33 -; GFX12-NEXT: v_dual_mov_b32 v1, s19 :: v_dual_mov_b32 v2, s36 -; GFX12-NEXT: v_mov_b32_e32 v9, s23 -; GFX12-NEXT: s_bfe_i64 s[4:5], s[28:29], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[6:7], s[26:27], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v8, s22 :: v_dual_mov_b32 v11, s21 -; GFX12-NEXT: v_dual_mov_b32 v10, s20 :: v_dual_mov_b32 v17, s25 -; GFX12-NEXT: v_dual_mov_b32 v16, s24 :: v_dual_mov_b32 v19, s27 +; GFX12-NEXT: v_dual_mov_b32 v0, s22 :: v_dual_mov_b32 v3, s33 +; GFX12-NEXT: v_dual_mov_b32 v1, s23 :: v_dual_mov_b32 v2, s36 +; GFX12-NEXT: v_mov_b32_e32 v9, s3 +; GFX12-NEXT: s_bfe_i64 s[6:7], s[28:29], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v11, s25 +; GFX12-NEXT: v_dual_mov_b32 v10, s24 :: v_dual_mov_b32 v17, s9 +; GFX12-NEXT: v_dual_mov_b32 v16, s8 :: v_dual_mov_b32 v19, s27 ; GFX12-NEXT: v_dual_mov_b32 v18, s29 :: v_dual_mov_b32 v21, s5 ; GFX12-NEXT: v_dual_mov_b32 v20, s4 :: v_dual_mov_b32 v23, s7 ; GFX12-NEXT: v_mov_b32_e32 v22, s6 @@ -8205,304 +8216,314 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 +; GFX6-NOHSA-NEXT: ; implicit-def: $sgpr26_sgpr27 +; GFX6-NOHSA-NEXT: ; implicit-def: $sgpr12_sgpr13 +; GFX6-NOHSA-NEXT: ; implicit-def: $sgpr14_sgpr15 +; GFX6-NOHSA-NEXT: ; implicit-def: $sgpr18_sgpr19 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s6, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s6, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s20, s6, 8 -; GFX6-NOHSA-NEXT: s_lshr_b32 s26, s4, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s22, s4, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s24, s4, 8 -; GFX6-NOHSA-NEXT: s_lshr_b32 s10, s2, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s2, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s14, s2, 8 -; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s0, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s18, s0, 24 -; GFX6-NOHSA-NEXT: s_mov_b32 s34, s7 -; GFX6-NOHSA-NEXT: s_ashr_i32 s11, s1, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s13, s1, 24 -; GFX6-NOHSA-NEXT: s_ashr_i32 s15, s3, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s17, s3, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s24, s6, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s10, s6, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s6, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s22, s4, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s20, s4, 24 +; GFX6-NOHSA-NEXT: s_mov_b32 s26, s7 +; GFX6-NOHSA-NEXT: s_mov_b32 s12, s5 +; GFX6-NOHSA-NEXT: s_mov_b32 s14, s3 +; GFX6-NOHSA-NEXT: s_mov_b32 s18, s1 +; GFX6-NOHSA-NEXT: s_ashr_i32 s23, s1, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s25, s1, 24 +; GFX6-NOHSA-NEXT: s_ashr_i32 s29, s3, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s31, s3, 24 ; GFX6-NOHSA-NEXT: s_ashr_i32 s33, s5, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s49, s5, 24 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[38:39], s[34:35], 0x80000 -; GFX6-NOHSA-NEXT: s_ashr_i32 s19, s7, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s21, s7, 24 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[52:53], s[30:31], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[54:55], s[28:29], 0x80000 +; GFX6-NOHSA-NEXT: s_ashr_i32 s35, s5, 24 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 +; GFX6-NOHSA-NEXT: s_ashr_i32 s11, s7, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s17, s7, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s38, s4, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s48, s2, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s36, s2, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s46, s2, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s42, s0, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s40, s0, 24 ; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s0, 8 -; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s7, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s34, s7, 8 -; GFX6-NOHSA-NEXT: s_lshr_b32 s36, s5, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s40, s5, 8 -; GFX6-NOHSA-NEXT: s_mov_b32 s46, s5 -; GFX6-NOHSA-NEXT: s_lshr_b32 s42, s3, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s44, s3, 8 -; GFX6-NOHSA-NEXT: s_mov_b32 s50, s3 -; GFX6-NOHSA-NEXT: s_lshr_b32 s48, s1, 16 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[56:57], s[0:1], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[56:57], s[24:25], 0x80000 +; GFX6-NOHSA-NEXT: s_lshr_b32 s24, s7, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s7, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s34, s5, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s44, s5, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s50, s3, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s52, s3, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s54, s1, 16 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[58:59], s[0:1], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[58:59], s[4:5], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[60:61], s[6:7], 0x80000 -; GFX6-NOHSA-NEXT: s_lshr_b32 s4, s1, 8 -; GFX6-NOHSA-NEXT: s_mov_b32 s6, s1 -; GFX6-NOHSA-NEXT: s_mov_b32 s0, s8 -; GFX6-NOHSA-NEXT: s_mov_b32 s1, s9 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s60 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s61 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s58 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s59 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s2 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s3 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s56 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s57 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s38 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s39 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s54 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s55 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s52 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s53 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v22, s21 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v23, s19 -; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[26:27], 0x80000 -; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v24, s8 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:208 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s21 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v25, s9 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v26, s22 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v27, s23 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s24 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s25 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:144 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[20:21], s[6:7], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[22:23], s[50:51], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[24:25], s[46:47], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[48:49], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[44:45], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[26:27], s[42:43], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[38:39], s[40:41], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 +; GFX6-NOHSA-NEXT: s_lshr_b32 s0, s1, 8 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s6 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s7 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s4 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s5 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s58 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s59 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s26 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s27 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s17 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s11 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s56 +; GFX6-NOHSA-NEXT: s_mov_b32 s11, 0xf000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[2:3], s[10:11], 0x80000 +; GFX6-NOHSA-NEXT: s_mov_b32 s10, -1 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s57 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s2 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s3 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[14:17], off, s[8:11], 0 offset:208 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[18:19], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[18:19], s[14:15], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[26:27], s[12:13], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[56:57], s[22:23], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[58:59], s[16:17], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[60:61], s[20:21], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[40:41], s[16:17], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[42:43], s[14:15], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[44:45], s[12:13], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[46:47], s[10:11], 0x80000 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:128 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(2) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s46 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s47 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s44 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s45 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[2:3], s[54:55], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[52:53], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[12:13], s[50:51], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[14:15], s[44:45], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[16:17], s[34:35], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[20:21], s[30:31], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[44:45], s[28:29], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[50:51], s[24:25], 0x80000 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s58 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s59 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[8:11], 0 offset:192 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s49 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s33 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s42 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s43 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:64 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s40 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s41 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s18 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s19 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s56 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s57 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s60 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s61 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[8:11], 0 offset:144 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s35 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s33 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s24 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s25 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s28 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s29 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s17 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s15 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s30 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s31 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:240 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s22 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s23 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s34 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s35 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:224 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(2) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s13 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s11 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s36 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s37 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s38 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s39 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[6:9], off, s[8:11], 0 offset:128 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s20 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s21 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s38 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s39 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:160 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s48 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s49 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s36 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s37 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[6:9], off, s[8:11], 0 offset:80 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s26 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s27 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:112 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s8 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s9 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:96 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s26 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s27 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s46 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s47 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:64 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s6 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s7 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s5 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s42 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s43 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s40 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s41 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s31 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s29 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s44 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s45 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[2:5], off, s[8:11], 0 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s18 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s19 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s50 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s51 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[10:13], off, s[8:11], 0 offset:240 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s25 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s23 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(1) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s20 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s21 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:224 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s16 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s17 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[8:11], 0 offset:176 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s14 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s15 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[8:11], 0 offset:160 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s12 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s13 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:112 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s6 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s7 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[8:11], 0 offset:96 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s2 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s3 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[18:21], off, s[8:11], 0 offset:48 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s1 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:32 ; GFX6-NOHSA-NEXT: s_endpgm ; ; GFX7-HSA-LABEL: constant_sextload_v32i8_to_v32i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0 ; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 -; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-HSA-NEXT: ; implicit-def: $sgpr12_sgpr13 +; GFX7-HSA-NEXT: ; implicit-def: $sgpr22_sgpr23 +; GFX7-HSA-NEXT: ; implicit-def: $sgpr14_sgpr15 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 +; GFX7-HSA-NEXT: ; implicit-def: $sgpr10_sgpr11 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-HSA-NEXT: s_lshr_b32 s12, s6, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s10, s6, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s16, s6, 16 +; GFX7-HSA-NEXT: s_mov_b32 s12, s1 ; GFX7-HSA-NEXT: s_ashr_i32 s33, s1, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s37, s1, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s34, s0, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s30, s0, 24 +; GFX7-HSA-NEXT: s_ashr_i32 s42, s1, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s20, s6, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s36, s0, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s34, s0, 24 ; GFX7-HSA-NEXT: s_lshr_b32 s28, s0, 8 -; GFX7-HSA-NEXT: s_lshr_b32 s64, s1, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s66, s1, 8 -; GFX7-HSA-NEXT: s_mov_b32 s68, s1 -; GFX7-HSA-NEXT: s_bfe_i64 s[22:23], s[0:1], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[0:1], s[12:13], 0x80000 -; GFX7-HSA-NEXT: s_lshr_b32 s36, s6, 8 -; GFX7-HSA-NEXT: s_lshr_b32 s40, s4, 16 -; GFX7-HSA-NEXT: s_ashr_i32 s41, s3, 31 -; GFX7-HSA-NEXT: s_lshr_b32 s50, s4, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s52, s4, 8 -; GFX7-HSA-NEXT: s_lshr_b32 s54, s2, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s56, s2, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s42, s2, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s70, s1, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s72, s1, 8 +; GFX7-HSA-NEXT: s_bfe_i64 s[18:19], s[0:1], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[0:1], s[16:17], 0x80000 +; GFX7-HSA-NEXT: s_mov_b32 s22, s7 +; GFX7-HSA-NEXT: s_mov_b32 s14, s5 +; GFX7-HSA-NEXT: s_mov_b32 s10, s3 +; GFX7-HSA-NEXT: s_lshr_b32 s50, s6, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s52, s4, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s54, s4, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s56, s4, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s58, s2, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s60, s2, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s40, s2, 8 ; GFX7-HSA-NEXT: s_lshr_b32 s26, s7, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s20, s7, 8 -; GFX7-HSA-NEXT: s_mov_b32 s24, s7 -; GFX7-HSA-NEXT: s_lshr_b32 s18, s5, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s14, s5, 8 -; GFX7-HSA-NEXT: s_mov_b32 s16, s5 -; GFX7-HSA-NEXT: s_lshr_b32 s58, s3, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s60, s3, 8 -; GFX7-HSA-NEXT: s_mov_b32 s62, s3 +; GFX7-HSA-NEXT: s_lshr_b32 s62, s7, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s24, s5, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s64, s5, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s66, s3, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s68, s3, 8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-HSA-NEXT: s_bfe_i64 s[0:1], s[10:11], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[0:1], s[20:21], 0x80000 +; GFX7-HSA-NEXT: s_ashr_i32 s43, s3, 31 ; GFX7-HSA-NEXT: s_ashr_i32 s44, s3, 24 ; GFX7-HSA-NEXT: s_ashr_i32 s45, s5, 31 ; GFX7-HSA-NEXT: s_ashr_i32 s46, s5, 24 ; GFX7-HSA-NEXT: s_ashr_i32 s47, s7, 31 ; GFX7-HSA-NEXT: s_ashr_i32 s48, s7, 24 -; GFX7-HSA-NEXT: s_bfe_i64 s[38:39], s[2:3], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[70:71], s[4:5], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[72:73], s[6:7], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[30:31], s[2:3], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[38:39], s[4:5], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[74:75], s[6:7], 0x80000 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-HSA-NEXT: s_bfe_i64 s[2:3], s[68:69], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[0:1], s[66:67], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[4:5], s[64:65], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[10:11], s[62:63], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[6:7], s[60:61], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[12:13], s[58:59], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[2:3], s[12:13], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[0:1], s[72:73], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[4:5], s[70:71], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[6:7], s[68:69], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[12:13], s[66:67], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[16:17], s[14:15], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[14:15], s[64:65], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[20:21], s[24:25], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[24:25], s[22:23], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[22:23], s[62:63], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[58:59], s[40:41], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[60:61], s[36:37], 0x80000 ; GFX7-HSA-NEXT: s_add_u32 s62, s8, 0xd0 ; GFX7-HSA-NEXT: s_addc_u32 s63, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s60 -; GFX7-HSA-NEXT: s_add_u32 s60, s8, 0xc0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s61 -; GFX7-HSA-NEXT: s_addc_u32 s61, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s50 -; GFX7-HSA-NEXT: s_add_u32 s50, s8, 0x90 -; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s51 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s50 +; GFX7-HSA-NEXT: s_add_u32 s50, s8, 0xc0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s51 ; GFX7-HSA-NEXT: s_addc_u32 s51, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s50 -; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s51 -; GFX7-HSA-NEXT: s_add_u32 s50, s8, 0x80 +; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s50 +; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s51 +; GFX7-HSA-NEXT: s_add_u32 s50, s8, 0x90 ; GFX7-HSA-NEXT: s_addc_u32 s51, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s38 +; GFX7-HSA-NEXT: s_add_u32 s38, s8, 0x80 +; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s39 +; GFX7-HSA-NEXT: s_addc_u32 s39, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s30 +; GFX7-HSA-NEXT: s_add_u32 s30, s8, 0x50 +; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s31 +; GFX7-HSA-NEXT: s_addc_u32 s31, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s30 ; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s62 -; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s38 -; GFX7-HSA-NEXT: s_add_u32 s38, s8, 0x50 +; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s31 +; GFX7-HSA-NEXT: s_add_u32 s30, s8, 64 ; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s63 -; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s39 -; GFX7-HSA-NEXT: s_addc_u32 s39, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s60 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s74 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s75 +; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s50 +; GFX7-HSA-NEXT: s_addc_u32 s31, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s52 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s53 +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s54 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s55 +; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s51 +; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s58 +; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s59 +; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s60 +; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s61 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s72 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s34 -; GFX7-HSA-NEXT: s_add_u32 s34, s8, 64 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s73 -; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s61 -; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s38 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s35 -; GFX7-HSA-NEXT: s_addc_u32 s35, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s58 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s59 -; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s54 -; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s55 -; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s56 -; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s57 -; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s39 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] ; GFX7-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s22 -; GFX7-HSA-NEXT: s_add_u32 s22, s8, 16 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s23 -; GFX7-HSA-NEXT: s_addc_u32 s23, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s18 +; GFX7-HSA-NEXT: s_add_u32 s18, s8, 16 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s19 +; GFX7-HSA-NEXT: s_addc_u32 s19, s9, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] -; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s34 -; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s22 -; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s23 -; GFX7-HSA-NEXT: s_add_u32 s22, s8, 0xf0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s30 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s31 -; GFX7-HSA-NEXT: s_addc_u32 s23, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s22 -; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s50 -; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s42 -; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s43 -; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s35 -; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s23 -; GFX7-HSA-NEXT: s_add_u32 s22, s8, 0xe0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s30 +; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s18 +; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s19 +; GFX7-HSA-NEXT: s_add_u32 s18, s8, 0xf0 +; GFX7-HSA-NEXT: s_addc_u32 s19, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s18 +; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s19 +; GFX7-HSA-NEXT: s_add_u32 s18, s8, 0xe0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s40 +; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s41 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s36 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s37 +; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s31 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-HSA-NEXT: s_addc_u32 s19, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s38 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[20:23] ; GFX7-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s70 +; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s19 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8 -; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s71 -; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s52 -; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s53 -; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s51 +; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s56 +; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s57 +; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s39 ; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s28 ; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s29 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[20:23] -; GFX7-HSA-NEXT: s_addc_u32 s23, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s22 +; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s18 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s9 +; GFX7-HSA-NEXT: s_add_u32 s18, s8, 0xb0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] ; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s26 ; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s27 @@ -8510,17 +8531,15 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s47 ; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s24 ; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s25 -; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s23 -; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s20 -; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s21 +; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s22 +; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s23 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GFX7-HSA-NEXT: flat_store_dwordx4 v[18:19], v[8:11] ; GFX7-HSA-NEXT: flat_store_dwordx4 v[20:21], v[12:15] -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s18 -; GFX7-HSA-NEXT: s_add_u32 s18, s8, 0xb0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19 ; GFX7-HSA-NEXT: s_addc_u32 s19, s9, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s18 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s20 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s21 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s46 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s45 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s19 @@ -8542,7 +8561,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_addc_u32 s13, s9, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s12 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s44 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s41 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s43 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s13 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 @@ -8561,7 +8580,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-HSA-NEXT: s_addc_u32 s5, s9, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s37 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s42 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s33 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -8580,95 +8599,101 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX8-NOHSA-LABEL: constant_sextload_v32i8_to_v32i64: ; GFX8-NOHSA: ; %bb.0: ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GFX8-NOHSA-NEXT: ; implicit-def: $sgpr24_sgpr25 +; GFX8-NOHSA-NEXT: ; implicit-def: $sgpr18_sgpr19 +; GFX8-NOHSA-NEXT: ; implicit-def: $sgpr14_sgpr15 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 +; GFX8-NOHSA-NEXT: ; implicit-def: $sgpr10_sgpr11 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NOHSA-NEXT: s_lshr_b32 s46, s6, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s48, s6, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s50, s6, 8 -; GFX8-NOHSA-NEXT: s_lshr_b32 s52, s4, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s54, s4, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s56, s4, 8 -; GFX8-NOHSA-NEXT: s_lshr_b32 s58, s2, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s60, s2, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s40, s2, 8 +; GFX8-NOHSA-NEXT: s_lshr_b32 s40, s6, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s42, s6, 24 +; GFX8-NOHSA-NEXT: s_lshr_b32 s46, s6, 8 +; GFX8-NOHSA-NEXT: s_lshr_b32 s48, s4, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s52, s4, 24 +; GFX8-NOHSA-NEXT: s_lshr_b32 s54, s4, 8 +; GFX8-NOHSA-NEXT: s_lshr_b32 s56, s2, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s58, s2, 24 +; GFX8-NOHSA-NEXT: s_lshr_b32 s44, s2, 8 ; GFX8-NOHSA-NEXT: s_lshr_b32 s36, s0, 16 ; GFX8-NOHSA-NEXT: s_lshr_b32 s34, s0, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s28, s0, 8 -; GFX8-NOHSA-NEXT: s_lshr_b32 s62, s7, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s24, s7, 8 -; GFX8-NOHSA-NEXT: s_mov_b32 s22, s7 -; GFX8-NOHSA-NEXT: s_lshr_b32 s64, s5, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s30, s0, 8 +; GFX8-NOHSA-NEXT: s_lshr_b32 s60, s7, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s26, s7, 8 +; GFX8-NOHSA-NEXT: s_mov_b32 s24, s7 +; GFX8-NOHSA-NEXT: s_lshr_b32 s62, s5, 16 ; GFX8-NOHSA-NEXT: s_lshr_b32 s20, s5, 8 ; GFX8-NOHSA-NEXT: s_mov_b32 s18, s5 -; GFX8-NOHSA-NEXT: s_lshr_b32 s66, s3, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s50, s3, 16 ; GFX8-NOHSA-NEXT: s_lshr_b32 s16, s3, 8 ; GFX8-NOHSA-NEXT: s_mov_b32 s14, s3 -; GFX8-NOHSA-NEXT: s_lshr_b32 s44, s1, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s64, s1, 16 ; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s1, 8 ; GFX8-NOHSA-NEXT: s_mov_b32 s10, s1 -; GFX8-NOHSA-NEXT: s_ashr_i32 s63, s5, 24 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[26:27], s[0:1], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[30:31], s[2:3], 0x80000 +; GFX8-NOHSA-NEXT: s_ashr_i32 s41, s1, 24 +; GFX8-NOHSA-NEXT: s_ashr_i32 s43, s3, 31 +; GFX8-NOHSA-NEXT: s_ashr_i32 s47, s3, 24 +; GFX8-NOHSA-NEXT: s_ashr_i32 s49, s5, 31 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[22:23], s[0:1], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[28:29], s[2:3], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[38:39], s[4:5], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[68:69], s[6:7], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[66:67], s[6:7], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 ; GFX8-NOHSA-NEXT: s_ashr_i32 s33, s1, 31 -; GFX8-NOHSA-NEXT: s_ashr_i32 s42, s1, 24 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[0:1], s[44:45], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[0:1], s[64:65], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 -; GFX8-NOHSA-NEXT: s_ashr_i32 s43, s3, 31 -; GFX8-NOHSA-NEXT: s_ashr_i32 s44, s3, 24 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[2:3], s[66:67], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[2:3], s[50:51], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 -; GFX8-NOHSA-NEXT: s_ashr_i32 s45, s5, 31 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[4:5], s[64:65], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 +; GFX8-NOHSA-NEXT: s_ashr_i32 s50, s5, 24 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[4:5], s[62:63], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 -; GFX8-NOHSA-NEXT: s_ashr_i32 s64, s7, 31 -; GFX8-NOHSA-NEXT: s_ashr_i32 s65, s7, 24 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[6:7], s[62:63], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 +; GFX8-NOHSA-NEXT: s_ashr_i32 s51, s7, 31 +; GFX8-NOHSA-NEXT: s_ashr_i32 s70, s7, 24 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[6:7], s[60:61], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x80000 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s46 -; GFX8-NOHSA-NEXT: s_add_u32 s46, s8, 0xd0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s47 -; GFX8-NOHSA-NEXT: s_addc_u32 s47, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s46 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s48 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s49 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s47 -; GFX8-NOHSA-NEXT: s_add_u32 s46, s8, 0xc0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: s_addc_u32 s47, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s46 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[60:61], s[48:49], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[62:63], s[46:47], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[64:65], s[42:43], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[68:69], s[40:41], 0x80000 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s64 +; GFX8-NOHSA-NEXT: s_add_u32 s64, s8, 0xd0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s65 +; GFX8-NOHSA-NEXT: s_addc_u32 s65, s9, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s64 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s68 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s69 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s50 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s51 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s47 -; GFX8-NOHSA-NEXT: s_add_u32 s46, s8, 0x90 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s65 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: s_addc_u32 s47, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s46 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s52 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s53 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s54 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s55 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s47 +; GFX8-NOHSA-NEXT: s_nop 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s62 +; GFX8-NOHSA-NEXT: s_add_u32 s62, s8, 0xc0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s63 +; GFX8-NOHSA-NEXT: s_addc_u32 s63, s9, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s62 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s66 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s67 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s63 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_nop 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s52 +; GFX8-NOHSA-NEXT: s_add_u32 s52, s8, 0x90 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s53 +; GFX8-NOHSA-NEXT: s_addc_u32 s53, s9, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s52 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s60 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s61 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s53 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s38 @@ -8676,43 +8701,43 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s39 ; GFX8-NOHSA-NEXT: s_addc_u32 s39, s9, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s38 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s56 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s57 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s54 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s55 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s39 ; GFX8-NOHSA-NEXT: s_add_u32 s38, s8, 0x50 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s39, s9, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s38 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s58 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s59 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s60 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s61 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s56 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s57 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s58 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s59 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s39 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s30 -; GFX8-NOHSA-NEXT: s_add_u32 s30, s8, 64 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s31 -; GFX8-NOHSA-NEXT: s_addc_u32 s31, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s30 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s40 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s41 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s31 -; GFX8-NOHSA-NEXT: s_add_u32 s30, s8, 16 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s28 +; GFX8-NOHSA-NEXT: s_add_u32 s28, s8, 64 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s29 +; GFX8-NOHSA-NEXT: s_addc_u32 s29, s9, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s28 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s44 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s45 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s29 +; GFX8-NOHSA-NEXT: s_add_u32 s28, s8, 16 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: s_addc_u32 s31, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s30 +; GFX8-NOHSA-NEXT: s_addc_u32 s29, s9, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s28 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s36 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s37 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s34 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s35 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s31 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s29 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s26 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s27 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s28 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s29 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s22 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s23 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s30 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s31 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s9 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 @@ -8721,17 +8746,17 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s9, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s65 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s64 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s70 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s51 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 ; GFX8-NOHSA-NEXT: s_add_u32 s6, s8, 0xe0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s9, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s22 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s23 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s24 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s25 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s24 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s25 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s26 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s27 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 @@ -8740,8 +8765,8 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NOHSA-NEXT: s_addc_u32 s5, s9, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s63 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s45 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s50 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s49 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NOHSA-NEXT: s_add_u32 s4, s8, 0xa0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -8761,7 +8786,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s8, 0x60 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s44 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s47 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s43 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s9, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -8780,7 +8805,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 32 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s42 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s41 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s33 ; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -8985,88 +9010,95 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX12-LABEL: constant_sextload_v32i8_to_v32i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 +; GFX12-NEXT: ; implicit-def: $sgpr18_sgpr19 +; GFX12-NEXT: ; implicit-def: $sgpr12_sgpr13 +; GFX12-NEXT: ; implicit-def: $sgpr24_sgpr25 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 +; GFX12-NEXT: ; implicit-def: $sgpr10_sgpr11 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshr_b32 s34, s6, 16 -; GFX12-NEXT: s_lshr_b32 s36, s6, 24 -; GFX12-NEXT: s_lshr_b32 s38, s6, 8 -; GFX12-NEXT: s_lshr_b32 s40, s4, 16 -; GFX12-NEXT: s_lshr_b32 s42, s4, 24 -; GFX12-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000 +; GFX12-NEXT: s_lshr_b32 s36, s6, 16 +; GFX12-NEXT: s_lshr_b32 s38, s6, 24 +; GFX12-NEXT: s_lshr_b32 s40, s6, 8 +; GFX12-NEXT: s_lshr_b32 s42, s4, 16 +; GFX12-NEXT: s_lshr_b32 s44, s4, 24 ; GFX12-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000 -; GFX12-NEXT: s_lshr_b32 s44, s4, 8 -; GFX12-NEXT: s_bfe_i64 s[66:67], s[6:7], 0x80000 ; GFX12-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s35 -; GFX12-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000 +; GFX12-NEXT: s_lshr_b32 s46, s4, 8 +; GFX12-NEXT: s_bfe_i64 s[66:67], s[6:7], 0x80000 ; GFX12-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v0, s34 :: v_dual_mov_b32 v3, s37 -; GFX12-NEXT: v_dual_mov_b32 v2, s36 :: v_dual_mov_b32 v5, s67 -; GFX12-NEXT: s_lshr_b32 s28, s2, 16 -; GFX12-NEXT: s_lshr_b32 s46, s2, 24 -; GFX12-NEXT: s_bfe_i64 s[64:65], s[4:5], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s37 ; GFX12-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v4, s66 :: v_dual_mov_b32 v7, s39 -; GFX12-NEXT: v_dual_mov_b32 v6, s38 :: v_dual_mov_b32 v9, s41 -; GFX12-NEXT: s_lshr_b32 s48, s2, 8 -; GFX12-NEXT: v_dual_mov_b32 v8, s40 :: v_dual_mov_b32 v11, s43 -; GFX12-NEXT: v_dual_mov_b32 v10, s42 :: v_dual_mov_b32 v13, s65 -; GFX12-NEXT: s_lshr_b32 s50, s0, 16 -; GFX12-NEXT: s_lshr_b32 s52, s0, 24 +; GFX12-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v0, s36 :: v_dual_mov_b32 v3, s39 +; GFX12-NEXT: v_dual_mov_b32 v2, s38 :: v_dual_mov_b32 v5, s67 +; GFX12-NEXT: s_lshr_b32 s30, s2, 16 +; GFX12-NEXT: s_lshr_b32 s48, s2, 24 +; GFX12-NEXT: s_bfe_i64 s[64:65], s[4:5], 0x80000 ; GFX12-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v12, s64 :: v_dual_mov_b32 v15, s45 -; GFX12-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 -; GFX12-NEXT: v_mov_b32_e32 v14, s44 -; GFX12-NEXT: s_lshr_b32 s54, s0, 8 -; GFX12-NEXT: s_bfe_i64 s[30:31], s[2:3], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v4, s66 :: v_dual_mov_b32 v7, s41 +; GFX12-NEXT: v_dual_mov_b32 v6, s40 :: v_dual_mov_b32 v9, s43 +; GFX12-NEXT: s_lshr_b32 s50, s2, 8 +; GFX12-NEXT: v_dual_mov_b32 v8, s42 :: v_dual_mov_b32 v11, s45 +; GFX12-NEXT: v_dual_mov_b32 v10, s44 :: v_dual_mov_b32 v13, s65 +; GFX12-NEXT: s_lshr_b32 s52, s0, 16 +; GFX12-NEXT: s_lshr_b32 s54, s0, 24 ; GFX12-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v12, s64 :: v_dual_mov_b32 v15, s47 +; GFX12-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 +; GFX12-NEXT: v_mov_b32_e32 v14, s46 +; GFX12-NEXT: s_lshr_b32 s34, s0, 8 +; GFX12-NEXT: s_bfe_i64 s[28:29], s[2:3], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x80000 ; GFX12-NEXT: s_lshr_b32 s56, s7, 16 ; GFX12-NEXT: s_lshr_b32 s58, s5, 16 -; GFX12-NEXT: s_lshr_b32 s60, s1, 8 -; GFX12-NEXT: s_mov_b32 s62, s1 +; GFX12-NEXT: s_lshr_b32 s60, s5, 8 +; GFX12-NEXT: s_mov_b32 s18, s5 +; GFX12-NEXT: s_lshr_b32 s20, s3, 8 +; GFX12-NEXT: s_mov_b32 s12, s3 +; GFX12-NEXT: s_lshr_b32 s14, s1, 16 +; GFX12-NEXT: s_lshr_b32 s62, s1, 8 +; GFX12-NEXT: s_mov_b32 s10, s1 ; GFX12-NEXT: s_ashr_i32 s57, s1, 24 ; GFX12-NEXT: s_ashr_i32 s59, s3, 31 ; GFX12-NEXT: s_ashr_i32 s61, s3, 24 ; GFX12-NEXT: s_ashr_i32 s63, s5, 31 +; GFX12-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x80000 ; GFX12-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x80000 ; GFX12-NEXT: s_clause 0x3 ; GFX12-NEXT: global_store_b128 v24, v[0:3], s[8:9] offset:208 ; GFX12-NEXT: global_store_b128 v24, v[4:7], s[8:9] offset:192 ; GFX12-NEXT: global_store_b128 v24, v[8:11], s[8:9] offset:144 ; GFX12-NEXT: global_store_b128 v24, v[12:15], s[8:9] offset:128 -; GFX12-NEXT: v_dual_mov_b32 v0, s28 :: v_dual_mov_b32 v3, s47 -; GFX12-NEXT: v_dual_mov_b32 v1, s29 :: v_dual_mov_b32 v2, s46 -; GFX12-NEXT: v_mov_b32_e32 v5, s31 +; GFX12-NEXT: v_dual_mov_b32 v0, s30 :: v_dual_mov_b32 v3, s49 +; GFX12-NEXT: v_dual_mov_b32 v1, s31 :: v_dual_mov_b32 v2, s48 +; GFX12-NEXT: v_mov_b32_e32 v5, s29 ; GFX12-NEXT: s_lshr_b32 s26, s7, 8 ; GFX12-NEXT: s_mov_b32 s24, s7 ; GFX12-NEXT: s_bfe_i64 s[22:23], s[0:1], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v4, s30 :: v_dual_mov_b32 v7, s49 -; GFX12-NEXT: v_dual_mov_b32 v6, s48 :: v_dual_mov_b32 v9, s51 -; GFX12-NEXT: s_lshr_b32 s18, s5, 8 -; GFX12-NEXT: s_mov_b32 s20, s5 +; GFX12-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v4, s28 :: v_dual_mov_b32 v7, s51 +; GFX12-NEXT: v_dual_mov_b32 v6, s50 :: v_dual_mov_b32 v9, s53 ; GFX12-NEXT: s_lshr_b32 s16, s3, 16 -; GFX12-NEXT: s_lshr_b32 s12, s3, 8 -; GFX12-NEXT: s_mov_b32 s14, s3 -; GFX12-NEXT: s_lshr_b32 s10, s1, 16 ; GFX12-NEXT: s_ashr_i32 s33, s1, 31 -; GFX12-NEXT: s_bfe_i64 s[2:3], s[62:63], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[0:1], s[60:61], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[2:3], s[10:11], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[0:1], s[62:63], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[10:11], s[14:15], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[14:15], s[12:13], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[12:13], s[20:21], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[20:21], s[18:19], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[18:19], s[60:61], 0x80000 ; GFX12-NEXT: s_ashr_i32 s60, s5, 24 ; GFX12-NEXT: s_bfe_i64 s[4:5], s[58:59], 0x80000 ; GFX12-NEXT: s_ashr_i32 s58, s7, 31 ; GFX12-NEXT: s_ashr_i32 s62, s7, 24 ; GFX12-NEXT: s_bfe_i64 s[6:7], s[56:57], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v8, s50 :: v_dual_mov_b32 v11, s53 -; GFX12-NEXT: v_dual_mov_b32 v10, s52 :: v_dual_mov_b32 v13, s23 +; GFX12-NEXT: v_dual_mov_b32 v8, s52 :: v_dual_mov_b32 v11, s55 +; GFX12-NEXT: v_dual_mov_b32 v10, s54 :: v_dual_mov_b32 v13, s23 ; GFX12-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 ; GFX12-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v12, s22 :: v_dual_mov_b32 v15, s55 -; GFX12-NEXT: v_dual_mov_b32 v14, s54 :: v_dual_mov_b32 v17, s7 -; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v12, s22 :: v_dual_mov_b32 v15, s35 +; GFX12-NEXT: v_dual_mov_b32 v14, s34 :: v_dual_mov_b32 v17, s7 ; GFX12-NEXT: v_dual_mov_b32 v16, s6 :: v_dual_mov_b32 v19, s58 ; GFX12-NEXT: v_dual_mov_b32 v18, s62 :: v_dual_mov_b32 v21, s25 ; GFX12-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 @@ -9081,20 +9113,16 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX12-NEXT: global_store_b128 v24, v[20:23], s[8:9] offset:224 ; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s63 ; GFX12-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s60 -; GFX12-NEXT: v_mov_b32_e32 v5, s21 -; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v4, s20 :: v_dual_mov_b32 v7, s19 -; GFX12-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s17 -; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v8, s16 :: v_dual_mov_b32 v11, s59 -; GFX12-NEXT: v_dual_mov_b32 v10, s61 :: v_dual_mov_b32 v13, s15 -; GFX12-NEXT: v_dual_mov_b32 v12, s14 :: v_dual_mov_b32 v15, s13 -; GFX12-NEXT: v_dual_mov_b32 v14, s12 :: v_dual_mov_b32 v17, s11 -; GFX12-NEXT: v_dual_mov_b32 v16, s10 :: v_dual_mov_b32 v19, s33 -; GFX12-NEXT: v_dual_mov_b32 v18, s57 :: v_dual_mov_b32 v21, s3 -; GFX12-NEXT: v_dual_mov_b32 v20, s2 :: v_dual_mov_b32 v23, s1 -; GFX12-NEXT: v_mov_b32_e32 v22, s0 +; GFX12-NEXT: v_dual_mov_b32 v5, s21 :: v_dual_mov_b32 v4, s20 +; GFX12-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v6, s18 +; GFX12-NEXT: v_dual_mov_b32 v9, s17 :: v_dual_mov_b32 v8, s16 +; GFX12-NEXT: v_dual_mov_b32 v11, s59 :: v_dual_mov_b32 v10, s61 +; GFX12-NEXT: v_dual_mov_b32 v13, s15 :: v_dual_mov_b32 v12, s14 +; GFX12-NEXT: v_dual_mov_b32 v15, s13 :: v_dual_mov_b32 v14, s12 +; GFX12-NEXT: v_dual_mov_b32 v17, s11 :: v_dual_mov_b32 v16, s10 +; GFX12-NEXT: v_dual_mov_b32 v19, s33 :: v_dual_mov_b32 v18, s57 +; GFX12-NEXT: v_dual_mov_b32 v21, s3 :: v_dual_mov_b32 v20, s2 +; GFX12-NEXT: v_dual_mov_b32 v23, s1 :: v_dual_mov_b32 v22, s0 ; GFX12-NEXT: s_clause 0x5 ; GFX12-NEXT: global_store_b128 v24, v[0:3], s[8:9] offset:176 ; GFX12-NEXT: global_store_b128 v24, v[4:7], s[8:9] offset:160 diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll index 09d3c3b01b809..c374c78c70ceb 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll @@ -5982,6 +5982,7 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out, ; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; GCN-NOHSA-SI-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) @@ -6005,6 +6006,7 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out, ; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 ; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-HSA-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -6041,6 +6043,7 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out, ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx2 v[1:2], off, s[8:11], 0 +; GCN-NOHSA-VI-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) @@ -6359,6 +6362,7 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out, ; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; GCN-NOHSA-SI-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) @@ -6393,6 +6397,7 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out, ; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 ; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-HSA-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -6448,6 +6453,7 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out, ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; GCN-NOHSA-VI-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) @@ -7010,6 +7016,8 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v2, 0, 16 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v1, 0, 16 +; GCN-NOHSA-SI-NEXT: ; implicit-def: $vgpr28_vgpr29 +; GCN-NOHSA-SI-NEXT: ; implicit-def: $vgpr28_vgpr29 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v21, 31, v20 @@ -7036,6 +7044,8 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 ; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-HSA-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GCN-HSA-NEXT: ; implicit-def: $vgpr8_vgpr9 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -7135,62 +7145,64 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v8, v0, 0, 16 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v1, 0, 16 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v12, v2, 0, 16 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v1, 0, 16 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v9 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v9, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v17, v1, 0, 16 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, v3 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v20, 16, v6 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v22, v7 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v4, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v5, 0, 16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, v7 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v5, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v27, v6, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v4, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v1, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v10, v9, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v11, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v19, v13, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v21, v3, 0, 16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v20, 16, v10 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v22, v11 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v27, v10, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v8, v8, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v10, v1, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v12, v6, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v3, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v5, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v19, v13, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v21, v7, 0, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:80 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v23, v22, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v25, v7, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v25, v11, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v29, v20, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: ; implicit-def: $vgpr15_vgpr16 +; GCN-NOHSA-VI-NEXT: ; implicit-def: $vgpr15_vgpr16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v11, 31, v10 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v13, 31, v12 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v28, 31, v27 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v11, 31, v10 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 31, v6 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v15, 31, v14 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v20, 31, v19 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v22, 31, v21 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v30, 31, v29 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v24, 31, v23 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v26, 31, v25 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:64 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:112 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:96 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:48 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_sextload_v16i16_to_v16i64: @@ -8119,7 +8131,7 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(3) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v20, 16, v14 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v19, 16, v14 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, v15 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v21, 16, v12 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(2) @@ -8129,7 +8141,7 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v6 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, v7 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v19, 16, v10 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v20, 16, v10 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v23, 0, 16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 31, v15 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 16, v15 @@ -8177,7 +8189,7 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v14, 0, 16 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v13, v21, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v24, v20, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v24, v19, 0, 16 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v3, v1, 0, 16 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v1, v8, 0, 16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22 @@ -8192,31 +8204,35 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:192 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v13, v4, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v20, v6, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v11, v19, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v19, v6, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v11, v20, 0, 16 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v15, v18, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v17, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v26, v16, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v24, v0, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v21, v17, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v25, v16, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v0, 0, 16 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v28, v2, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v30, v0, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v27, v2, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v29, v0, 0, 16 +; GCN-NOHSA-SI-NEXT: ; implicit-def: $vgpr16_vgpr17 +; GCN-NOHSA-SI-NEXT: ; implicit-def: $vgpr16_vgpr17 +; GCN-NOHSA-SI-NEXT: ; implicit-def: $vgpr16_vgpr17 +; GCN-NOHSA-SI-NEXT: ; implicit-def: $vgpr16_vgpr17 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v10, 31, v9 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v21, 31, v20 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v24 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v29, 31, v28 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v23 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v28, 31, v27 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v12, 31, v11 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v27, 31, v26 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v31, 31, v30 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:160 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:128 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:96 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v22, 31, v21 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 31, v25 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v30, 31, v29 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:160 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:128 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:96 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:64 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[1:4], off, s[0:3], 0 @@ -8229,6 +8245,10 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 ; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-HSA-NEXT: ; implicit-def: $vgpr16_vgpr17 +; GCN-HSA-NEXT: ; implicit-def: $vgpr16_vgpr17 +; GCN-HSA-NEXT: ; implicit-def: $vgpr16_vgpr17 +; GCN-HSA-NEXT: ; implicit-def: $vgpr16_vgpr17 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -8512,6 +8532,7 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 31, v6 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; GCN-NOHSA-VI-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v20, 31, v19 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v22, 31, v21 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v12, 31, v11 @@ -8520,6 +8541,9 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v24, 31, v23 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v26, 31, v25 +; GCN-NOHSA-VI-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN-NOHSA-VI-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN-NOHSA-VI-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:112 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:64 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:80 diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll index f879dc660203f..8d1ed92ab3c11 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll @@ -5868,17 +5868,17 @@ define amdgpu_kernel void @global_zextload_v4i8_to_v4i64(ptr addrspace(1) %out, define amdgpu_kernel void @global_sextload_v4i8_to_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_sextload_v4i8_to_v4i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s3 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v4, 24, v0 @@ -5891,8 +5891,8 @@ define amdgpu_kernel void @global_sextload_v4i8_to_v4i64(ptr addrspace(1) %out, ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v7, 31, v6 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: global_sextload_v4i8_to_v4i64: @@ -6931,77 +6931,79 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i64(ptr addrspace(1) %out ; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; GCN-NOHSA-SI-NEXT: ; implicit-def: $sgpr6_sgpr7 +; GCN-NOHSA-SI-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s4, v2 ; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s5, v3 -; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s6, v0 -; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s7, v1 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s4, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s4, 24 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s16, s4, 8 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s6, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s6, 24 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s6, 8 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s5, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s10, s5, 8 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s26, s5 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s7, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s7, 8 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s7 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[34:35], s[6:7], 0x80000 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s10, v0 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s11, v1 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s4, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s16, s4, 24 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s4, 8 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s10, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s10, 24 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s10, 8 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s5, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s5, 8 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, s5 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s11, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s11, 8 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s11 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[34:35], s[10:11], 0x80000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[4:5], 0x80000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s29, s7, 31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s31, s7, 24 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s5, 31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s38, s5, 24 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s27, s11, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s29, s11, 24 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[38:39], s[6:7], 0x80000 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s31, s5, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s5, 24 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[30:31], 0x80000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[28:29], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[26:27], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s36 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s37 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s34 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s35 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s38 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s33 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s26 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s27 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s12 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s13 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s15 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s33 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s31 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s38 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s39 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s14 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s15 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s17 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:80 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s31 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s29 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s16 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s17 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s29 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s18 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s19 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s18 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s19 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s20 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s21 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s20 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s21 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s22 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s23 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s22 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s23 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s24 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s25 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s13 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:112 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s10 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s11 @@ -7021,46 +7023,50 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i64(ptr addrspace(1) %out ; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 ; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-HSA-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCN-HSA-NEXT: ; implicit-def: $sgpr2_sgpr3 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) -; GCN-HSA-NEXT: v_readfirstlane_b32 s2, v2 -; GCN-HSA-NEXT: v_readfirstlane_b32 s3, v3 -; GCN-HSA-NEXT: v_readfirstlane_b32 s4, v0 -; GCN-HSA-NEXT: v_readfirstlane_b32 s5, v1 -; GCN-HSA-NEXT: s_lshr_b32 s6, s2, 16 -; GCN-HSA-NEXT: s_lshr_b32 s8, s2, 24 -; GCN-HSA-NEXT: s_lshr_b32 s10, s2, 8 -; GCN-HSA-NEXT: s_lshr_b32 s18, s3, 16 -; GCN-HSA-NEXT: s_lshr_b32 s20, s3, 8 -; GCN-HSA-NEXT: s_mov_b32 s22, s3 -; GCN-HSA-NEXT: s_ashr_i32 s7, s3, 31 -; GCN-HSA-NEXT: s_ashr_i32 s9, s3, 24 -; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000 -; GCN-HSA-NEXT: s_lshr_b32 s12, s4, 16 -; GCN-HSA-NEXT: s_lshr_b32 s14, s4, 24 -; GCN-HSA-NEXT: s_lshr_b32 s16, s4, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x80000 -; GCN-HSA-NEXT: s_ashr_i32 s4, s5, 24 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 -; GCN-HSA-NEXT: s_lshr_b32 s2, s5, 16 -; GCN-HSA-NEXT: s_ashr_i32 s3, s5, 31 +; GCN-HSA-NEXT: v_readfirstlane_b32 s6, v2 +; GCN-HSA-NEXT: v_readfirstlane_b32 s7, v3 +; GCN-HSA-NEXT: v_readfirstlane_b32 s8, v0 +; GCN-HSA-NEXT: v_readfirstlane_b32 s9, v1 +; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[6:7], 0x80000 +; GCN-HSA-NEXT: s_ashr_i32 s4, s7, 31 +; GCN-HSA-NEXT: s_lshr_b32 s16, s8, 16 +; GCN-HSA-NEXT: s_lshr_b32 s18, s8, 24 +; GCN-HSA-NEXT: s_lshr_b32 s20, s8, 8 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s24 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s25 +; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[8:9], 0x80000 +; GCN-HSA-NEXT: s_ashr_i32 s8, s7, 24 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s4 +; GCN-HSA-NEXT: s_ashr_i32 s4, s9, 24 +; GCN-HSA-NEXT: s_lshr_b32 s10, s6, 16 +; GCN-HSA-NEXT: s_lshr_b32 s12, s6, 24 +; GCN-HSA-NEXT: s_lshr_b32 s14, s6, 8 +; GCN-HSA-NEXT: s_lshr_b32 s22, s7, 16 +; GCN-HSA-NEXT: s_mov_b32 s2, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s24 +; GCN-HSA-NEXT: s_lshr_b32 s6, s7, 8 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s8 +; GCN-HSA-NEXT: s_ashr_i32 s7, s9, 31 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s4 -; GCN-HSA-NEXT: s_lshr_b32 s4, s5, 8 -; GCN-HSA-NEXT: s_mov_b32 s24, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s9 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3 -; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 +; GCN-HSA-NEXT: s_lshr_b32 s8, s9, 16 +; GCN-HSA-NEXT: s_mov_b32 s4, s9 +; GCN-HSA-NEXT: s_lshr_b32 s24, s9, 8 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s25 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s7 ; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 @@ -7068,66 +7074,64 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i64(ptr addrspace(1) %out ; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s17 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s6 -; GCN-HSA-NEXT: s_add_u32 s6, s0, 0x50 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s20 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s21 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s10 +; GCN-HSA-NEXT: s_add_u32 s10, s0, 0x50 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[2:5] -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s7 -; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 -; GCN-HSA-NEXT: s_add_u32 s6, s0, 64 -; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s11 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s11 +; GCN-HSA-NEXT: s_addc_u32 s11, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11 +; GCN-HSA-NEXT: s_add_u32 s10, s0, 64 +; GCN-HSA-NEXT: s_addc_u32 s11, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s11 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s13 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[12:15] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s13 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s17 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s18 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s19 ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s18 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s19 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s22 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s23 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[4:7] -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s22 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s23 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s20 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s21 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s9 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s24 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s25 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s24 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s25 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15] ; GCN-HSA-NEXT: s_endpgm @@ -7144,65 +7148,67 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i64(ptr addrspace(1) %out ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 +; GCN-NOHSA-VI-NEXT: ; implicit-def: $sgpr10_sgpr11 +; GCN-NOHSA-VI-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v2 -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s6, v0 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s4, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s4, 24 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s8, v0 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s4, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s18, s4, 24 ; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s5, v3 -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s7, v1 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s18, s4, 8 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s20, s6, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s22, s6, 24 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s24, s6, 8 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s9, v1 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s20, s4, 8 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s22, s8, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s24, s8, 24 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s26, s8, 8 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s26, s5, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s5, 8 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s12, s5 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s7, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s28, s7, 8 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s30, s7 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[34:35], s[6:7], 0x80000 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s5, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s28, s5, 8 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s5 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s9, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s30, s9, 8 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, s9 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[34:35], s[8:9], 0x80000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[36:37], s[4:5], 0x80000 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s29, s7, 31 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s31, s7, 24 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s29, s9, 31 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s31, s9, 24 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s14 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s15 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s17 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s17 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s18 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s19 ; GCN-NOHSA-VI-NEXT: s_ashr_i32 s33, s5, 31 ; GCN-NOHSA-VI-NEXT: s_ashr_i32 s38, s5, 24 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[30:31], 0x80000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[28:29], 0x80000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[30:31], 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[12:13], 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[10:11], 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[28:29], 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s36 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s37 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s34 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s35 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s18 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s19 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s20 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s21 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v16, s20 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, s21 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v18, s22 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, s23 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s24 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s25 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v16, s22 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, s23 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v18, s24 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, s25 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s26 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s27 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s38 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, s33 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, s31 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, s29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s26 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s27 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s14 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s15 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v12, s8 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12 @@ -8173,168 +8179,172 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i64(ptr addrspace(1) %out ; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 +; GCN-NOHSA-SI-NEXT: ; implicit-def: $sgpr26_sgpr27 +; GCN-NOHSA-SI-NEXT: ; implicit-def: $sgpr6_sgpr7 +; GCN-NOHSA-SI-NEXT: ; implicit-def: $sgpr8_sgpr9 +; GCN-NOHSA-SI-NEXT: ; implicit-def: $sgpr10_sgpr11 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) -; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s18, v2 -; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s19, v3 -; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s22, v0 -; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s23, v1 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s20, v2 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s21, v3 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s24, v0 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s25, v1 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s24, v6 -; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s25, v7 -; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s12, v4 -; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s13, v5 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s38, s18, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s18, 24 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s18, 8 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s22, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s22, 24 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s22, 8 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s4, s24, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s6, s24, 24 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s8, s24, 8 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s10, s12, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s12, 24 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s16, s12, 8 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s19, 16 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[40:41], s[12:13], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[42:43], s[18:19], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[44:45], s[22:23], 0x80000 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s42 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s43 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[42:43], s[24:25], 0x80000 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s44 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s45 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s19, 8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s42 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s43 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s12, s19 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s40 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s41 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s40, s23, 16 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s38 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s39 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s38, s23, 8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s36 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s37 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s22, s23 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[12:13], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s22, v6 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s23, v7 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s18, v4 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s19, v5 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s20, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s20, 24 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s6, s20, 8 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s8, s24, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s24, 24 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s10, s24, 8 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s4, s22, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s22, 24 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s22, 8 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s16, s18, 16 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[34:35], s[20:21], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[24:25], 0x80000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s34 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s35 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[34:35], s[22:23], 0x80000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s36 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s37 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[18:19], 0x80000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s34 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s35 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s18, 24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s36 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s37 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s18, 8 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[34:35], s[26:27], 0x80000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s30 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s31 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s21, 16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s34 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s35 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s21, 8 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s26, s21 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[30:31], s[6:7], 0x80000 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:208 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s36 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s37 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s25, 16 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s34 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s35 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s25, 8 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s26 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s27 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s25, 16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s30 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s31 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s25, 8 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, s25 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[34:35], s[8:9], 0x80000 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s30 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s25, 31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s39, s23, 31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s41, s23, 24 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s5, s19, 31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s7, s19, 24 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s44, s25, 24 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s12, s25 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s34 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s35 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s23, 16 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s23 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s28 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s29 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s13, 16 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s26 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s27 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s13, 8 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s23, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s35, s25, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s37, s25, 24 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s5, s21, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s13, s21, 24 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s39, s23, 24 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[10:11], 0x80000 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s23, 8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s28 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s29 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s19, 16 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s19 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s45, s13, 31 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s13 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s46, s19, 31 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s5 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s46, s13, 24 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s30, s13 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s47, s19, 24 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s38, s19, 8 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[40:41], s[10:11], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[42:43], s[8:9], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[44:45], s[6:7], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[38:39], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[28:29], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[36:37], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[34:35], 0x80000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[34:35], s[12:13], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[42:43], s[22:23], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[26:27], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[22:23], s[28:29], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[26:27], s[36:37], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[38:39], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[40:41], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:128 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s4 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s7 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s13 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s41 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s39 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s37 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s35 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s14 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s15 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:64 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s10 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s11 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s17 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s15 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s20 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s21 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s42 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s43 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s16 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s17 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s44 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s45 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s18 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s19 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s44 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s39 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s33 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s20 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s21 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s22 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s23 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s34 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s35 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s18 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s19 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s42 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s43 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s25 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:224 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s46 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s45 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s36 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s37 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s47 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s46 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s26 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s27 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:176 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s30 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s31 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s28 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s29 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s40 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s41 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s30 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s31 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:160 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s26 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s28 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s29 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:112 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s24 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s25 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s11 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(1) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s22 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s23 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s9 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s12 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s13 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s6 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s7 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_endpgm ; @@ -8344,6 +8354,9 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i64(ptr addrspace(1) %out ; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 ; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: ; implicit-def: $sgpr8_sgpr9 +; GCN-HSA-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GCN-HSA-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -8353,89 +8366,90 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i64(ptr addrspace(1) %out ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCN-HSA-NEXT: ; implicit-def: $sgpr2_sgpr3 ; GCN-HSA-NEXT: s_waitcnt vmcnt(1) -; GCN-HSA-NEXT: v_readfirstlane_b32 s6, v6 -; GCN-HSA-NEXT: v_readfirstlane_b32 s8, v4 -; GCN-HSA-NEXT: v_readfirstlane_b32 s9, v5 -; GCN-HSA-NEXT: v_readfirstlane_b32 s7, v7 -; GCN-HSA-NEXT: s_lshr_b32 s20, s6, 16 -; GCN-HSA-NEXT: s_lshr_b32 s18, s6, 24 -; GCN-HSA-NEXT: s_lshr_b32 s10, s8, 16 -; GCN-HSA-NEXT: s_lshr_b32 s2, s8, 24 -; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[8:9], 0x80000 -; GCN-HSA-NEXT: s_lshr_b32 s16, s6, 8 -; GCN-HSA-NEXT: s_lshr_b32 s4, s8, 8 -; GCN-HSA-NEXT: s_lshr_b32 s12, s7, 16 -; GCN-HSA-NEXT: s_lshr_b32 s14, s7, 8 -; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[6:7], 0x80000 -; GCN-HSA-NEXT: s_lshr_b32 s6, s9, 16 -; GCN-HSA-NEXT: s_mov_b32 s28, s9 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s26 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s27 -; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[18:19], 0x80000 +; GCN-HSA-NEXT: v_readfirstlane_b32 s10, v6 +; GCN-HSA-NEXT: v_readfirstlane_b32 s12, v4 +; GCN-HSA-NEXT: v_readfirstlane_b32 s11, v7 +; GCN-HSA-NEXT: v_readfirstlane_b32 s13, v5 +; GCN-HSA-NEXT: s_lshr_b32 s2, s10, 16 +; GCN-HSA-NEXT: s_lshr_b32 s8, s10, 24 +; GCN-HSA-NEXT: s_lshr_b32 s14, s10, 8 +; GCN-HSA-NEXT: s_lshr_b32 s16, s12, 16 +; GCN-HSA-NEXT: s_lshr_b32 s18, s12, 24 +; GCN-HSA-NEXT: s_lshr_b32 s26, s12, 8 +; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[10:11], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[22:23], s[12:13], 0x80000 +; GCN-HSA-NEXT: s_lshr_b32 s28, s11, 8 +; GCN-HSA-NEXT: s_mov_b32 s4, s11 +; GCN-HSA-NEXT: s_bfe_i64 s[40:41], s[2:3], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[42:43], s[8:9], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[44:45], s[16:17], 0x80000 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) -; GCN-HSA-NEXT: v_readfirstlane_b32 s40, v2 -; GCN-HSA-NEXT: v_readfirstlane_b32 s41, v3 -; GCN-HSA-NEXT: s_bfe_i64 s[42:43], s[10:11], 0x80000 -; GCN-HSA-NEXT: v_readfirstlane_b32 s44, v0 -; GCN-HSA-NEXT: v_readfirstlane_b32 s45, v1 -; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000 -; GCN-HSA-NEXT: s_mov_b32 s22, s7 -; GCN-HSA-NEXT: s_lshr_b32 s8, s9, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s24 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s25 -; GCN-HSA-NEXT: s_bfe_i64 s[30:31], s[16:17], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[28:29], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[16:17], s[6:7], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[18:19], s[14:15], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[12:13], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[46:47], s[4:5], 0x80000 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s20 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s26 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s27 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s42 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GCN-HSA-NEXT: s_lshr_b32 s42, s44, 16 -; GCN-HSA-NEXT: s_lshr_b32 s48, s44, 24 -; GCN-HSA-NEXT: s_lshr_b32 s28, s44, 8 -; GCN-HSA-NEXT: s_lshr_b32 s6, s45, 16 -; GCN-HSA-NEXT: s_lshr_b32 s2, s45, 8 -; GCN-HSA-NEXT: s_mov_b32 s4, s45 -; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[44:45], 0x80000 -; GCN-HSA-NEXT: s_lshr_b32 s44, s40, 16 -; GCN-HSA-NEXT: s_lshr_b32 s50, s40, 24 -; GCN-HSA-NEXT: s_lshr_b32 s52, s40, 8 -; GCN-HSA-NEXT: s_lshr_b32 s20, s41, 16 -; GCN-HSA-NEXT: s_lshr_b32 s12, s41, 8 -; GCN-HSA-NEXT: s_mov_b32 s14, s41 -; GCN-HSA-NEXT: s_ashr_i32 s33, s9, 31 -; GCN-HSA-NEXT: s_ashr_i32 s37, s7, 31 -; GCN-HSA-NEXT: s_ashr_i32 s38, s7, 24 -; GCN-HSA-NEXT: s_ashr_i32 s34, s9, 24 -; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s21 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s30 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s31 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s43 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 -; GCN-HSA-NEXT: s_ashr_i32 s30, s45, 31 -; GCN-HSA-NEXT: s_ashr_i32 s31, s45, 24 -; GCN-HSA-NEXT: s_ashr_i32 s35, s41, 31 -; GCN-HSA-NEXT: s_ashr_i32 s36, s41, 24 -; GCN-HSA-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000 +; GCN-HSA-NEXT: v_readfirstlane_b32 s46, v2 +; GCN-HSA-NEXT: v_readfirstlane_b32 s47, v3 +; GCN-HSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 +; GCN-HSA-NEXT: v_readfirstlane_b32 s48, v0 +; GCN-HSA-NEXT: v_readfirstlane_b32 s49, v1 +; GCN-HSA-NEXT: s_lshr_b32 s24, s11, 16 +; GCN-HSA-NEXT: s_lshr_b32 s30, s13, 16 +; GCN-HSA-NEXT: s_mov_b32 s6, s13 +; GCN-HSA-NEXT: s_lshr_b32 s10, s13, 8 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s20 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s21 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s22 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s23 +; GCN-HSA-NEXT: s_bfe_i64 s[22:23], s[4:5], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[28:29], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[50:51], s[26:27], 0x80000 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s40 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s42 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s44 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s18 +; GCN-HSA-NEXT: s_mov_b32 s2, s47 +; GCN-HSA-NEXT: s_mov_b32 s8, s49 +; GCN-HSA-NEXT: s_lshr_b32 s40, s48, 16 +; GCN-HSA-NEXT: s_lshr_b32 s42, s48, 24 +; GCN-HSA-NEXT: s_lshr_b32 s28, s48, 8 +; GCN-HSA-NEXT: s_lshr_b32 s14, s49, 16 +; GCN-HSA-NEXT: s_lshr_b32 s4, s49, 8 +; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[48:49], 0x80000 +; GCN-HSA-NEXT: s_lshr_b32 s44, s46, 16 +; GCN-HSA-NEXT: s_lshr_b32 s48, s46, 24 +; GCN-HSA-NEXT: s_lshr_b32 s52, s46, 8 +; GCN-HSA-NEXT: s_lshr_b32 s18, s47, 16 +; GCN-HSA-NEXT: s_lshr_b32 s54, s47, 8 +; GCN-HSA-NEXT: s_ashr_i32 s37, s11, 31 +; GCN-HSA-NEXT: s_ashr_i32 s38, s11, 24 +; GCN-HSA-NEXT: s_ashr_i32 s33, s13, 31 +; GCN-HSA-NEXT: s_ashr_i32 s34, s13, 24 +; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[6:7], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[16:17], s[30:31], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s41 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s43 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s45 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s19 +; GCN-HSA-NEXT: s_ashr_i32 s30, s49, 31 +; GCN-HSA-NEXT: s_ashr_i32 s31, s49, 24 +; GCN-HSA-NEXT: s_ashr_i32 s35, s47, 31 +; GCN-HSA-NEXT: s_ashr_i32 s36, s47, 24 +; GCN-HSA-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[8:9], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[14:15], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[2:3], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[54:55], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000 ; GCN-HSA-NEXT: s_add_u32 s54, s0, 0x50 ; GCN-HSA-NEXT: s_addc_u32 s55, s1, 0 @@ -8447,19 +8461,20 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i64(ptr addrspace(1) %out ; GCN-HSA-NEXT: v_mov_b32_e32 v27, s55 ; GCN-HSA-NEXT: s_add_u32 s54, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s55, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s40 +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[12:15] +; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s40 ; GCN-HSA-NEXT: s_add_u32 s40, s0, 0xd0 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s41 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s41 ; GCN-HSA-NEXT: s_addc_u32 s41, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] -; GCN-HSA-NEXT: v_mov_b32_e32 v28, s54 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s40 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s54 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s41 ; GCN-HSA-NEXT: s_add_u32 s40, s0, 0xc0 ; GCN-HSA-NEXT: v_mov_b32_e32 v29, s55 ; GCN-HSA-NEXT: s_addc_u32 s41, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[0:3] -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[12:15] +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s50 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s26 ; GCN-HSA-NEXT: s_add_u32 s26, s0, 0x90 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s27 @@ -8468,8 +8483,7 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i64(ptr addrspace(1) %out ; GCN-HSA-NEXT: v_mov_b32_e32 v25, s27 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1 ; GCN-HSA-NEXT: s_add_u32 s26, s0, 0x80 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s46 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s47 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s51 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0 ; GCN-HSA-NEXT: s_addc_u32 s27, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[4:7] @@ -8477,37 +8491,37 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i64(ptr addrspace(1) %out ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s24 ; GCN-HSA-NEXT: s_add_u32 s24, s0, 0x70 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s45 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s50 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s51 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s48 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s49 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s40 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s25 ; GCN-HSA-NEXT: s_addc_u32 s25, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s46 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s47 ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s52 ; GCN-HSA-NEXT: v_mov_b32_e32 v23, s53 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s42 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s43 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s41 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s48 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s49 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s42 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s43 ; GCN-HSA-NEXT: v_mov_b32_e32 v26, s26 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[16:19] ; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[20:23] -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s18 -; GCN-HSA-NEXT: s_add_u32 s18, s0, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s20 +; GCN-HSA-NEXT: s_add_u32 s20, s0, 0x60 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s28 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s29 ; GCN-HSA-NEXT: v_mov_b32_e32 v27, s27 ; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[12:15] -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s19 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s21 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s24 -; GCN-HSA-NEXT: s_addc_u32 s19, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s18 +; GCN-HSA-NEXT: s_addc_u32 s21, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s20 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s22 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s38 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s37 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s23 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s25 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s19 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s21 ; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[0:3] ; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[4:7] ; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[8:11] @@ -8521,50 +8535,50 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i64(ptr addrspace(1) %out ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s17 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_nop 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s8 -; GCN-HSA-NEXT: s_add_u32 s8, s0, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s9 -; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s11 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9 -; GCN-HSA-NEXT: s_add_u32 s8, s0, 0xf0 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10 +; GCN-HSA-NEXT: s_add_u32 s10, s0, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s11 +; GCN-HSA-NEXT: s_addc_u32 s11, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11 +; GCN-HSA-NEXT: s_add_u32 s10, s0, 0xf0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s20 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s21 +; GCN-HSA-NEXT: s_addc_u32 s11, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s18 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s19 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s36 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s35 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9 -; GCN-HSA-NEXT: s_add_u32 s8, s0, 0xe0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 +; GCN-HSA-NEXT: s_nop 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s15 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s13 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_nop 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 -; GCN-HSA-NEXT: s_add_u32 s6, s0, 0xb0 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s7 -; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 0xa0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s9 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s31 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s30 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm @@ -8581,151 +8595,155 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i64(ptr addrspace(1) %out ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 +; GCN-NOHSA-VI-NEXT: ; implicit-def: $sgpr26_sgpr27 +; GCN-NOHSA-VI-NEXT: ; implicit-def: $sgpr18_sgpr19 +; GCN-NOHSA-VI-NEXT: ; implicit-def: $sgpr10_sgpr11 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 +; GCN-NOHSA-VI-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) ; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v2 -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s9, v7 -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s11, v5 ; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s5, v3 -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s6, v0 -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s10, v4 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s9, v1 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s13, v7 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s15, v5 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s8, v0 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s12, v6 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s14, v4 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s38, s4, 16 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s40, s4, 24 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s42, s4, 8 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s60, s9, 8 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s39, s11, 24 -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s7, v1 -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s8, v6 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s48, s6, 8 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s56, s10, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s58, s10, 24 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s36, s10, 8 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s62, s11, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s64, s11, 8 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s66, s11 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[20:21], s[10:11], 0x80000 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s44, s4, 8 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s60, s5, 8 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s26, s5 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s28, s9, 8 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s18, s9 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s20, s13, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s62, s13, 8 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s13 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s64, s15, 16 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s39, s15, 24 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s41, s13, 31 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s45, s13, 24 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s46, s8, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s50, s8, 24 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s52, s8, 8 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s54, s12, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s56, s12, 24 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s58, s12, 8 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s48, s14, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s42, s14, 24 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s36, s14, 8 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s22, s9, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s66, s15, 8 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, s15 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[16:17], s[14:15], 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[24:25], s[12:13], 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[34:35], s[8:9], 0x80000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[68:69], s[4:5], 0x80000 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s33, s11, 31 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[60:61], 0x80000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[60:61], s[38:39], 0x80000 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s50, s8, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s52, s8, 24 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[34:35], s[6:7], 0x80000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s33, s15, 31 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s67, s9, 31 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s70, s9, 24 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[64:65], 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[10:11], 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[62:63], 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[20:21], 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[20:21], s[18:19], 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[18:19], s[28:29], 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[28:29], s[26:27], 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[26:27], s[60:61], 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[60:61], s[44:45], 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[62:63], s[40:41], 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[64:65], s[38:39], 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x80000 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s68 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s69 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s60 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s61 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s40 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s41 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s42 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s43 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s54, s8, 8 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x80000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x80000 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s64 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s65 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s62 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s63 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s60 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s61 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x80000 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:208 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[28:29], s[8:9], 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x80000 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s34 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s35 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s48 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s49 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x80000 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x80000 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s50 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s51 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s52 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s53 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x80000 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s54 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s55 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s56 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s57 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s30, s5, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s54 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s55 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s24, s5, 8 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s56 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s57 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s24 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s25 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s58 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s59 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s26, s5 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GCN-NOHSA-VI-NEXT: s_ashr_i32 s71, s5, 31 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s48 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s49 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s42 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s43 ; GCN-NOHSA-VI-NEXT: s_ashr_i32 s72, s5, 24 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s22, s7, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s20 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s21 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s17 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s36 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s37 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s7, 8 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s30 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s31 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s72 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s71 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s18, s7 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s67, s7, 31 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s70, s7, 24 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s9, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s26 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s27 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s24 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s25 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[66:67], 0x80000 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s29 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s26 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s27 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s12, s9 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x80000 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s22 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s23 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s70 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s67 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s63, s9, 31 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s65, s9, 24 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s18 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s19 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s17 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x80000 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s20 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s21 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s18 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s19 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s44, s6, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s46 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s15 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s65 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s63 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s46, s6, 24 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[62:63], 0x80000 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s45 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s41 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[66:67], 0x80000 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s47 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s13 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s11 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[64:65], 0x80000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x80000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s44 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s50 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s9 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s39 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s33 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s45 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s46 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, s47 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, s51 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:144 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 diff --git a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll index ddd1ce66c013a..76cd984eecd92 100644 --- a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll @@ -5736,6 +5736,7 @@ define amdgpu_kernel void @local_sextload_v4i16_to_v4i64(ptr addrspace(3) %out, ; SI-NEXT: v_mov_b32_e32 v0, s1 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_read_b64 v[0:1], v0 +; SI-NEXT: ; implicit-def: $vgpr2_vgpr3 ; SI-NEXT: v_mov_b32_e32 v9, s0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v3, v1 @@ -5756,6 +5757,7 @@ define amdgpu_kernel void @local_sextload_v4i16_to_v4i64(ptr addrspace(3) %out, ; VI-NO-DS128: ; %bb.0: ; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 +; VI-NO-DS128-NEXT: ; implicit-def: $vgpr2_vgpr3 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_mov_b32_e32 v0, s1 ; VI-NO-DS128-NEXT: ds_read_b64 v[0:1], v0 @@ -5778,6 +5780,7 @@ define amdgpu_kernel void @local_sextload_v4i16_to_v4i64(ptr addrspace(3) %out, ; GFX9-NO-DS128-LABEL: local_sextload_v4i16_to_v4i64: ; GFX9-NO-DS128: ; %bb.0: ; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NO-DS128-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NO-DS128-NEXT: ds_read_b64 v[0:1], v0 @@ -5846,6 +5849,7 @@ define amdgpu_kernel void @local_sextload_v4i16_to_v4i64(ptr addrspace(3) %out, ; VI-DS128: ; %bb.0: ; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-DS128-NEXT: s_mov_b32 m0, -1 +; VI-DS128-NEXT: ; implicit-def: $vgpr2_vgpr3 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_mov_b32_e32 v0, s1 ; VI-DS128-NEXT: ds_read_b64 v[0:1], v0 @@ -5869,6 +5873,7 @@ define amdgpu_kernel void @local_sextload_v4i16_to_v4i64(ptr addrspace(3) %out, ; GFX9-DS128-LABEL: local_sextload_v4i16_to_v4i64: ; GFX9-DS128: ; %bb.0: ; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-DS128-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-DS128-NEXT: ds_read_b64 v[0:1], v0 @@ -6140,6 +6145,7 @@ define amdgpu_kernel void @local_sextload_v8i16_to_v8i64(ptr addrspace(3) %out, ; SI-NEXT: v_mov_b32_e32 v0, s1 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 +; SI-NEXT: ; implicit-def: $vgpr4_vgpr5 ; SI-NEXT: v_mov_b32_e32 v16, s0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, v3 @@ -6171,6 +6177,7 @@ define amdgpu_kernel void @local_sextload_v8i16_to_v8i64(ptr addrspace(3) %out, ; VI-NO-DS128: ; %bb.0: ; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 +; VI-NO-DS128-NEXT: ; implicit-def: $vgpr16_vgpr17 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_mov_b32_e32 v0, s1 ; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 @@ -6206,6 +6213,7 @@ define amdgpu_kernel void @local_sextload_v8i16_to_v8i64(ptr addrspace(3) %out, ; GFX9-NO-DS128-LABEL: local_sextload_v8i16_to_v8i64: ; GFX9-NO-DS128: ; %bb.0: ; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NO-DS128-NEXT: ; implicit-def: $vgpr16_vgpr17 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 @@ -6335,17 +6343,18 @@ define amdgpu_kernel void @local_sextload_v8i16_to_v8i64(ptr addrspace(3) %out, ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_bfe_i32 v4, v0, 0, 16 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-DS128-NEXT: v_bfe_i32 v8, v1, 0, 16 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-DS128-NEXT: v_bfe_i32 v6, v0, 0, 16 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; VI-DS128-NEXT: v_bfe_i32 v12, v2, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v10, v1, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v14, v0, 0, 16 +; VI-DS128-NEXT: ; implicit-def: $vgpr0_vgpr1 +; VI-DS128-NEXT: v_bfe_i32 v12, v2, 0, 16 ; VI-DS128-NEXT: v_mov_b32_e32 v0, v3 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; VI-DS128-NEXT: v_bfe_i32 v8, v1, 0, 16 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v10, v1, 0, 16 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 @@ -6370,17 +6379,18 @@ define amdgpu_kernel void @local_sextload_v8i16_to_v8i64(ptr addrspace(3) %out, ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_bfe_i32 v4, v0, 0, 16 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-DS128-NEXT: v_bfe_i32 v8, v1, 0, 16 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-DS128-NEXT: v_bfe_i32 v6, v0, 0, 16 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX9-DS128-NEXT: v_bfe_i32 v12, v2, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v10, v1, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v14, v0, 0, 16 +; GFX9-DS128-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX9-DS128-NEXT: v_bfe_i32 v12, v2, 0, 16 ; GFX9-DS128-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GFX9-DS128-NEXT: v_bfe_i32 v8, v1, 0, 16 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v10, v1, 0, 16 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 @@ -6816,6 +6826,8 @@ define amdgpu_kernel void @local_sextload_v16i16_to_v16i64(ptr addrspace(3) %out ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_read2_b64 v[0:3], v4 offset0:2 offset1:3 ; SI-NEXT: ds_read2_b64 v[4:7], v4 offset1:1 +; SI-NEXT: ; implicit-def: $vgpr8_vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8_vgpr9 ; SI-NEXT: v_mov_b32_e32 v18, s0 ; SI-NEXT: s_waitcnt lgkmcnt(1) ; SI-NEXT: v_mov_b32_e32 v12, v3 @@ -6874,60 +6886,62 @@ define amdgpu_kernel void @local_sextload_v16i16_to_v16i64(ptr addrspace(3) %out ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_mov_b32_e32 v4, s1 ; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v4 offset1:1 -; VI-NO-DS128-NEXT: ds_read2_b64 v[4:7], v4 offset0:2 offset1:3 +; VI-NO-DS128-NEXT: ds_read2_b64 v[8:11], v4 offset0:2 offset1:3 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v19, s0 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v18, 16, v3 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v14, 16, v4 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v14, 16, v8 ; VI-NO-DS128-NEXT: v_bfe_i32 v14, v14, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v16, v4, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v16, v8, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v9 ; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[16:17], v[14:15] offset0:8 offset1:9 -; VI-NO-DS128-NEXT: v_bfe_i32 v14, v4, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v4, v5, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v14, v8, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v8, v9, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[4:5], v[14:15] offset0:10 offset1:11 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; VI-NO-DS128-NEXT: v_bfe_i32 v4, v4, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v14, v6, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[8:9], v[14:15] offset0:10 offset1:11 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v10 +; VI-NO-DS128-NEXT: v_bfe_i32 v8, v8, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v14, v10, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, v8 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 -; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[14:15], v[4:5] offset0:12 offset1:13 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v14, 16, v7 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v16, v7 +; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[14:15], v[8:9] offset0:12 offset1:13 +; VI-NO-DS128-NEXT: v_bfe_i32 v8, v18, 0, 16 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v14, 16, v11 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v18, v11 +; VI-NO-DS128-NEXT: ; implicit-def: $vgpr16_vgpr17 ; VI-NO-DS128-NEXT: v_bfe_i32 v14, v14, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v16, v16, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v4, v18, 0, 16 -; VI-NO-DS128-NEXT: v_mov_b32_e32 v18, v3 +; VI-NO-DS128-NEXT: v_bfe_i32 v16, v18, 0, 16 +; VI-NO-DS128-NEXT: v_mov_b32_e32 v20, v3 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[16:17], v[14:15] offset0:14 offset1:15 -; VI-NO-DS128-NEXT: v_bfe_i32 v14, v18, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v8, v8, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v10, v9, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v12, v11, 0, 16 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; VI-NO-DS128-NEXT: v_bfe_i32 v14, v20, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v4, v4, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v6, v5, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v12, v7, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, v8 ; VI-NO-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16 -; VI-NO-DS128-NEXT: v_bfe_i32 v6, v1, 0, 16 +; VI-NO-DS128-NEXT: v_bfe_i32 v10, v1, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 31, v10 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v13, 31, v12 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 31, v10 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[14:15], v[4:5] offset0:6 offset1:7 +; VI-NO-DS128-NEXT: ; implicit-def: $vgpr16_vgpr17 +; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[14:15], v[8:9] offset0:6 offset1:7 ; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[2:3], v[12:13] offset0:4 offset1:5 -; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[6:7], v[10:11] offset0:2 offset1:3 -; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[0:1], v[8:9] offset1:1 +; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[10:11], v[6:7] offset0:2 offset1:3 +; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[0:1], v[4:5] offset1:1 ; VI-NO-DS128-NEXT: s_endpgm ; ; GFX9-NO-DS128-LABEL: local_sextload_v16i16_to_v16i64: @@ -6936,60 +6950,62 @@ define amdgpu_kernel void @local_sextload_v16i16_to_v16i64(ptr addrspace(3) %out ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v4 offset1:1 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[4:7], v4 offset0:2 offset1:3 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[8:11], v4 offset0:2 offset1:3 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v19, s0 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v18, 16, v3 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v14, 16, v4 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v14, 16, v8 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v14, v14, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v4, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v8, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v9 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[16:17], v[14:15] offset0:8 offset1:9 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v14, v4, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v4, v5, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v14, v8, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v8, v9, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[4:5], v[14:15] offset0:10 offset1:11 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v6 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v4, v4, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v14, v6, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[8:9], v[14:15] offset0:10 offset1:11 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v10 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v8, v8, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v14, v10, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, v8 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[14:15], v[4:5] offset0:12 offset1:13 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v14, 16, v7 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v16, v7 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[14:15], v[8:9] offset0:12 offset1:13 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v8, v18, 0, 16 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v14, 16, v11 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v18, v11 +; GFX9-NO-DS128-NEXT: ; implicit-def: $vgpr16_vgpr17 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v14, v14, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v16, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v4, v18, 0, 16 -; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v18, v3 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v18, 0, 16 +; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v20, v3 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[16:17], v[14:15] offset0:14 offset1:15 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v14, v18, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v8, v8, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v10, v9, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v12, v11, 0, 16 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v14, v20, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v4, v4, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v6, v5, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v12, v7, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, v8 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v6, v1, 0, 16 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v10, v1, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 31, v10 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v13, 31, v12 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 31, v10 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[14:15], v[4:5] offset0:6 offset1:7 +; GFX9-NO-DS128-NEXT: ; implicit-def: $vgpr16_vgpr17 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[14:15], v[8:9] offset0:6 offset1:7 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[2:3], v[12:13] offset0:4 offset1:5 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[6:7], v[10:11] offset0:2 offset1:3 -; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[0:1], v[8:9] offset1:1 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[10:11], v[6:7] offset0:2 offset1:3 +; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[0:1], v[4:5] offset1:1 ; GFX9-NO-DS128-NEXT: s_endpgm ; ; EG-LABEL: local_sextload_v16i16_to_v16i64: @@ -7171,16 +7187,16 @@ define amdgpu_kernel void @local_sextload_v16i16_to_v16i64(ptr addrspace(3) %out ; VI-DS128-NEXT: v_mov_b32_e32 v0, s1 ; VI-DS128-NEXT: ds_read_b128 v[3:6], v0 ; VI-DS128-NEXT: ds_read_b128 v[7:10], v0 offset:16 +; VI-DS128-NEXT: v_mov_b32_e32 v18, s0 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) -; VI-DS128-NEXT: v_mov_b32_e32 v18, v6 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v6 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_bfe_i32 v11, v8, 0, 16 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v8 ; VI-DS128-NEXT: v_bfe_i32 v13, v8, 0, 16 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; VI-DS128-NEXT: v_mov_b32_e32 v8, s0 -; VI-DS128-NEXT: ds_write_b128 v8, v[11:14] offset:80 +; VI-DS128-NEXT: ds_write_b128 v18, v[11:14] offset:80 ; VI-DS128-NEXT: v_bfe_i32 v11, v7, 0, 16 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; VI-DS128-NEXT: v_bfe_i32 v13, v7, 0, 16 @@ -7188,24 +7204,25 @@ define amdgpu_kernel void @local_sextload_v16i16_to_v16i64(ptr addrspace(3) %out ; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v10 -; VI-DS128-NEXT: ds_write_b128 v8, v[11:14] offset:64 +; VI-DS128-NEXT: ds_write_b128 v18, v[11:14] offset:64 ; VI-DS128-NEXT: v_bfe_i32 v11, v15, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v13, v7, 0, 16 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 +; VI-DS128-NEXT: ds_write_b128 v18, v[11:14] offset:112 +; VI-DS128-NEXT: ; implicit-def: $vgpr14_vgpr15 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v9 -; VI-DS128-NEXT: ds_write_b128 v8, v[11:14] offset:112 ; VI-DS128-NEXT: v_bfe_i32 v14, v9, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v16, v16, 0, 16 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v6 +; VI-DS128-NEXT: v_mov_b32_e32 v8, v6 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v3 ; VI-DS128-NEXT: v_bfe_i32 v10, v4, 0, 16 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; VI-DS128-NEXT: ds_write_b128 v8, v[14:17] offset:96 -; VI-DS128-NEXT: v_bfe_i32 v14, v18, 0, 16 +; VI-DS128-NEXT: ds_write_b128 v18, v[14:17] offset:96 +; VI-DS128-NEXT: v_bfe_i32 v14, v8, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v16, v19, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v0, v3, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 @@ -7220,10 +7237,11 @@ define amdgpu_kernel void @local_sextload_v16i16_to_v16i64(ptr addrspace(3) %out ; VI-DS128-NEXT: v_ashrrev_i32_e32 v13, 31, v12 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; VI-DS128-NEXT: ds_write_b128 v8, v[14:17] offset:48 -; VI-DS128-NEXT: ds_write_b128 v8, v[4:7] offset:32 -; VI-DS128-NEXT: ds_write_b128 v8, v[10:13] offset:16 -; VI-DS128-NEXT: ds_write_b128 v8, v[0:3] +; VI-DS128-NEXT: ; implicit-def: $vgpr8_vgpr9 +; VI-DS128-NEXT: ds_write_b128 v18, v[14:17] offset:48 +; VI-DS128-NEXT: ds_write_b128 v18, v[4:7] offset:32 +; VI-DS128-NEXT: ds_write_b128 v18, v[10:13] offset:16 +; VI-DS128-NEXT: ds_write_b128 v18, v[0:3] ; VI-DS128-NEXT: s_endpgm ; ; GFX9-DS128-LABEL: local_sextload_v16i16_to_v16i64: @@ -7233,6 +7251,7 @@ define amdgpu_kernel void @local_sextload_v16i16_to_v16i64(ptr addrspace(3) %out ; GFX9-DS128-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-DS128-NEXT: ds_read_b128 v[3:6], v0 ; GFX9-DS128-NEXT: ds_read_b128 v[7:10], v0 offset:16 +; GFX9-DS128-NEXT: v_mov_b32_e32 v18, s0 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-DS128-NEXT: v_bfe_i32 v0, v3, 0, 16 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v3 @@ -7242,8 +7261,7 @@ define amdgpu_kernel void @local_sextload_v16i16_to_v16i64(ptr addrspace(3) %out ; GFX9-DS128-NEXT: v_bfe_i32 v13, v3, 0, 16 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GFX9-DS128-NEXT: v_mov_b32_e32 v8, s0 -; GFX9-DS128-NEXT: ds_write_b128 v8, v[11:14] offset:80 +; GFX9-DS128-NEXT: ds_write_b128 v18, v[11:14] offset:80 ; GFX9-DS128-NEXT: v_bfe_i32 v11, v7, 0, 16 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; GFX9-DS128-NEXT: v_bfe_i32 v13, v7, 0, 16 @@ -7251,24 +7269,25 @@ define amdgpu_kernel void @local_sextload_v16i16_to_v16i64(ptr addrspace(3) %out ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v10 -; GFX9-DS128-NEXT: ds_write_b128 v8, v[11:14] offset:64 +; GFX9-DS128-NEXT: ds_write_b128 v18, v[11:14] offset:64 ; GFX9-DS128-NEXT: v_bfe_i32 v11, v15, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v13, v7, 0, 16 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 +; GFX9-DS128-NEXT: ds_write_b128 v18, v[11:14] offset:112 +; GFX9-DS128-NEXT: ; implicit-def: $vgpr14_vgpr15 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v9 -; GFX9-DS128-NEXT: ds_write_b128 v8, v[11:14] offset:112 ; GFX9-DS128-NEXT: v_bfe_i32 v14, v9, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v16, v16, 0, 16 -; GFX9-DS128-NEXT: v_mov_b32_e32 v18, v6 +; GFX9-DS128-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v6 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GFX9-DS128-NEXT: v_bfe_i32 v10, v4, 0, 16 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; GFX9-DS128-NEXT: ds_write_b128 v8, v[14:17] offset:96 -; GFX9-DS128-NEXT: v_bfe_i32 v14, v18, 0, 16 +; GFX9-DS128-NEXT: ds_write_b128 v18, v[14:17] offset:96 +; GFX9-DS128-NEXT: v_bfe_i32 v14, v8, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v16, v19, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v12, v4, 0, 16 @@ -7282,10 +7301,11 @@ define amdgpu_kernel void @local_sextload_v16i16_to_v16i64(ptr addrspace(3) %out ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v13, 31, v12 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; GFX9-DS128-NEXT: ds_write_b128 v8, v[14:17] offset:48 -; GFX9-DS128-NEXT: ds_write_b128 v8, v[4:7] offset:32 -; GFX9-DS128-NEXT: ds_write_b128 v8, v[10:13] offset:16 -; GFX9-DS128-NEXT: ds_write_b128 v8, v[0:3] +; GFX9-DS128-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GFX9-DS128-NEXT: ds_write_b128 v18, v[14:17] offset:48 +; GFX9-DS128-NEXT: ds_write_b128 v18, v[4:7] offset:32 +; GFX9-DS128-NEXT: ds_write_b128 v18, v[10:13] offset:16 +; GFX9-DS128-NEXT: ds_write_b128 v18, v[0:3] ; GFX9-DS128-NEXT: s_endpgm %load = load <16 x i16>, ptr addrspace(3) %in %ext = sext <16 x i16> %load to <16 x i64> @@ -8092,7 +8112,7 @@ define amdgpu_kernel void @local_sextload_v32i16_to_v32i64(ptr addrspace(3) %out ; SI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 ; SI-NEXT: ds_write2_b64 v7, v[5:6], v[15:16] offset0:12 offset1:13 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v2 ; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 ; SI-NEXT: v_bfe_i32 v3, v4, 0, 16 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v8 @@ -8109,7 +8129,11 @@ define amdgpu_kernel void @local_sextload_v32i16_to_v32i64(ptr addrspace(3) %out ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 ; SI-NEXT: v_bfe_i32 v9, v0, 0, 16 ; SI-NEXT: v_bfe_i32 v10, v2, 0, 16 -; SI-NEXT: v_bfe_i32 v12, v11, 0, 16 +; SI-NEXT: ; implicit-def: $vgpr11_vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11_vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11_vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11_vgpr12 +; SI-NEXT: v_bfe_i32 v12, v13, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v11, 31, v10 ; SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12 ; SI-NEXT: ds_write2_b64 v7, v[10:11], v[12:13] offset0:4 offset1:5 @@ -8139,6 +8163,7 @@ define amdgpu_kernel void @local_sextload_v32i16_to_v32i64(ptr addrspace(3) %out ; VI-NO-DS128: ; %bb.0: ; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 +; VI-NO-DS128-NEXT: ; implicit-def: $vgpr23_vgpr24 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_mov_b32_e32 v7, s1 ; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v7 offset0:6 offset1:7 @@ -8242,6 +8267,9 @@ define amdgpu_kernel void @local_sextload_v32i16_to_v32i64(ptr addrspace(3) %out ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; VI-NO-DS128-NEXT: ; implicit-def: $vgpr21_vgpr22 +; VI-NO-DS128-NEXT: ; implicit-def: $vgpr21_vgpr22 +; VI-NO-DS128-NEXT: ; implicit-def: $vgpr21_vgpr22 ; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[17:18], v[19:20] offset0:6 offset1:7 ; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[8:9], v[15:16] offset0:4 offset1:5 ; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[6:7], v[13:14] offset0:2 offset1:3 @@ -8251,6 +8279,7 @@ define amdgpu_kernel void @local_sextload_v32i16_to_v32i64(ptr addrspace(3) %out ; GFX9-NO-DS128-LABEL: local_sextload_v32i16_to_v32i64: ; GFX9-NO-DS128: ; %bb.0: ; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NO-DS128-NEXT: ; implicit-def: $vgpr22_vgpr23 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v8, s1 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[4:7], v8 offset0:6 offset1:7 @@ -8354,6 +8383,9 @@ define amdgpu_kernel void @local_sextload_v32i16_to_v32i64(ptr addrspace(3) %out ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, v8 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 31, v10 +; GFX9-NO-DS128-NEXT: ; implicit-def: $vgpr20_vgpr21 +; GFX9-NO-DS128-NEXT: ; implicit-def: $vgpr20_vgpr21 +; GFX9-NO-DS128-NEXT: ; implicit-def: $vgpr20_vgpr21 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[16:17], v[18:19] offset0:6 offset1:7 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[10:11], v[12:13] offset0:4 offset1:5 ; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[8:9], v[1:2] offset0:2 offset1:3 @@ -8788,13 +8820,14 @@ define amdgpu_kernel void @local_sextload_v32i16_to_v32i64(ptr addrspace(3) %out ; VI-DS128-NEXT: v_bfe_i32 v11, v5, 0, 16 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v10, 31, v9 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v18 ; VI-DS128-NEXT: ds_write_b128 v8, v[9:12] offset:64 ; VI-DS128-NEXT: v_bfe_i32 v9, v4, 0, 16 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-DS128-NEXT: v_bfe_i32 v11, v4, 0, 16 +; VI-DS128-NEXT: ; implicit-def: $vgpr4_vgpr5 ; VI-DS128-NEXT: v_bfe_i32 v13, v18, 0, 16 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v18 ; VI-DS128-NEXT: v_bfe_i32 v15, v5, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v11, v4, 0, 16 ; VI-DS128-NEXT: v_mov_b32_e32 v4, v7 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 @@ -8814,6 +8847,9 @@ define amdgpu_kernel void @local_sextload_v32i16_to_v32i64(ptr addrspace(3) %out ; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; VI-DS128-NEXT: ; implicit-def: $vgpr17_vgpr18 +; VI-DS128-NEXT: ; implicit-def: $vgpr17_vgpr18 +; VI-DS128-NEXT: ; implicit-def: $vgpr17_vgpr18 ; VI-DS128-NEXT: ds_write_b128 v8, v[4:7] offset:32 ; VI-DS128-NEXT: ds_write_b128 v8, v[13:16] offset:48 ; VI-DS128-NEXT: ds_write_b128 v8, v[9:12] @@ -8899,17 +8935,18 @@ define amdgpu_kernel void @local_sextload_v32i16_to_v32i64(ptr addrspace(3) %out ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v18 ; GFX9-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:112 -; GFX9-DS128-NEXT: v_bfe_i32 v13, v18, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v15, v0, 0, 16 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v8 +; GFX9-DS128-NEXT: v_bfe_i32 v13, v18, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v6, v8, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v8, v0, 0, 16 +; GFX9-DS128-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v19 ; GFX9-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:64 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v8 ; GFX9-DS128-NEXT: v_bfe_i32 v13, v19, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v15, v1, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v6, v8, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v8, v0, 0, 16 ; GFX9-DS128-NEXT: v_mov_b32_e32 v0, v11 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 @@ -8929,6 +8966,9 @@ define amdgpu_kernel void @local_sextload_v32i16_to_v32i64(ptr addrspace(3) %out ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, v8 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; GFX9-DS128-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX9-DS128-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX9-DS128-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-DS128-NEXT: ds_write_b128 v12, v[17:20] offset:32 ; GFX9-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:48 ; GFX9-DS128-NEXT: ds_write_b128 v12, v[6:9] diff --git a/llvm/test/CodeGen/AMDGPU/lround.ll b/llvm/test/CodeGen/AMDGPU/lround.ll index d8d8308f6cd8a..6c65fbad7a067 100644 --- a/llvm/test/CodeGen/AMDGPU/lround.ll +++ b/llvm/test/CodeGen/AMDGPU/lround.ll @@ -822,14 +822,16 @@ define half @intrinsic_fround_half(half %arg) { ; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr2 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_trunc_f16_e32 v1.h, v1.l ; GFX11-SDAG-TRUE16-NEXT: v_sub_f16_e32 v1.l, v1.l, v1.h ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_cmp_ge_f16_e64 s0, |v1.l|, 0.5 -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, 0, 0x3c00, s0 +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0, 0x3c00, s0 ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v1, v0 +; GFX11-SDAG-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v2 ; GFX11-SDAG-TRUE16-NEXT: v_add_f16_e32 v0.l, v1.h, v0.l ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -943,14 +945,16 @@ define i32 @intrinsic_lround_i32_f16(half %arg) { ; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr2 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_trunc_f16_e32 v1.h, v1.l ; GFX11-SDAG-TRUE16-NEXT: v_sub_f16_e32 v1.l, v1.l, v1.h ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_cmp_ge_f16_e64 s0, |v1.l|, 0.5 -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, 0, 0x3c00, s0 +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0, 0x3c00, s0 ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v1, v0 +; GFX11-SDAG-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v2 ; GFX11-SDAG-TRUE16-NEXT: v_add_f16_e32 v0.l, v1.h, v0.l ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll index 1ae3434db6da5..dd31246dd3f1e 100644 --- a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll @@ -137,13 +137,23 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo(half %src0, half %s } define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo(half %src0, half %src1, half %src2, half %lo) #0 { -; GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX11-TRUE16-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo: +; SDAG-GFX11-TRUE16: ; %bb.0: +; SDAG-GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4 +; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v3.l +; SDAG-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX11-TRUE16-NEXT: v_fma_mixhi_f16 v4, v0, v1, v2 op_sel_hi:[1,1,1] +; SDAG-GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v4 +; SDAG-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX11-FAKE16-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo: +; SDAG-GFX11-FAKE16: ; %bb.0: +; SDAG-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX11-FAKE16-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] +; SDAG-GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; SDAG-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo: ; GFX9: ; %bb.0: @@ -172,6 +182,14 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo(half %src0, half %src ; SDAG-CI-NEXT: v_mov_b32_e32 v0, v3 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; +; GISEL-GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo: +; GISEL-GFX11: ; %bb.0: +; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, v3 +; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GISEL-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo: ; GISEL-CI: ; %bb.0: ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll index eab92668c536b..6c162a55f59eb 100644 --- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll @@ -411,9 +411,10 @@ define <2 x half> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x half ; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v2f32: ; SDAG-GFX1100-TRUE16: ; %bb.0: ; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v4, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX1100-TRUE16-NEXT: ; implicit-def: $vgpr3 ; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v3.h, v4.l ; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] ; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX1100-TRUE16-NEXT: v_mov_b32_e32 v0, v3 @@ -534,10 +535,11 @@ define <3 x half> @v_mad_mix_v3f32(<3 x half> %src0, <3 x half> %src1, <3 x half ; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v3f32: ; SDAG-GFX1100-TRUE16: ; %bb.0: ; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v7, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX1100-TRUE16-NEXT: ; implicit-def: $vgpr6 ; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1] ; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v6.h, v6.l +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v6.h, v7.l ; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] ; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX1100-TRUE16-NEXT: v_mov_b32_e32 v0, v6 @@ -704,11 +706,13 @@ define <4 x half> @v_mad_mix_v4f32(<4 x half> %src0, <4 x half> %src1, <4 x half ; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v4f32: ; SDAG-GFX1100-TRUE16: ; %bb.0: ; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] -; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v8, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v9, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX1100-TRUE16-NEXT: ; implicit-def: $vgpr6 +; SDAG-GFX1100-TRUE16-NEXT: ; implicit-def: $vgpr7 ; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v6.h, v6.l -; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v6.h, v8.l +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v7.h, v9.l ; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] ; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] @@ -1452,9 +1456,12 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x half> %src0, <2 x half> ; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo: ; SDAG-GFX1100-TRUE16: ; %bb.0: ; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v4, v0, v1, v2 op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-TRUE16-NEXT: ; implicit-def: $vgpr3 ; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.l ; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX1100-TRUE16-NEXT: v_mov_b32_e32 v0, v3 ; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -1617,9 +1624,12 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_hi(<2 x half> %src0, <2 x half> ; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v2f32_clamp_postcvt_hi: ; SDAG-GFX1100-TRUE16: ; %bb.0: ; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v4, v0, v1, v2 op_sel_hi:[1,1,1] +; SDAG-GFX1100-TRUE16-NEXT: ; implicit-def: $vgpr3 ; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.l ; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX1100-TRUE16-NEXT: v_mov_b32_e32 v0, v3 ; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll index e6960a3f710da..47dd7638f2588 100644 --- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll +++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll @@ -623,6 +623,7 @@ define i64 @mad_i64_i32_extops_i32_i64(i32 %arg0, i32 %arg1, i64 %arg2) #0 { ; GFX9-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v0 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, v4, v[2:3] +; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX9-NEXT: v_mov_b32_e32 v2, v1 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v4, v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, v2 @@ -634,20 +635,24 @@ define i64 @mad_i64_i32_extops_i32_i64(i32 %arg0, i32 %arg1, i64 %arg2) #0 { ; GFX1100-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1100-NEXT: v_mad_u64_u32 v[0:1], null, v5, v4, v[2:3] -; GFX1100-NEXT: v_ashrrev_i32_e32 v5, 31, v5 -; GFX1100-NEXT: v_mov_b32_e32 v3, v1 +; GFX1100-NEXT: v_ashrrev_i32_e32 v3, 31, v5 +; GFX1100-NEXT: ; implicit-def: $vgpr5_vgpr6 +; GFX1100-NEXT: v_mov_b32_e32 v5, v1 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_mad_u64_u32 v[1:2], null, v5, v4, v[3:4] +; GFX1100-NEXT: v_mad_u64_u32 v[1:2], null, v3, v4, v[5:6] ; GFX1100-NEXT: s_setpc_b64 s[30:31] ; ; GFX1150-LABEL: mad_i64_i32_extops_i32_i64: ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1150-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1150-NEXT: v_mad_u64_u32 v[0:1], null, v5, v4, v[2:3] -; GFX1150-NEXT: v_ashrrev_i32_e32 v2, 31, v5 -; GFX1150-NEXT: v_mad_u64_u32 v[1:2], null, v2, v4, v[1:2] +; GFX1150-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1150-NEXT: v_ashrrev_i32_e32 v5, 31, v5 +; GFX1150-NEXT: v_mov_b32_e32 v2, v1 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-NEXT: v_mad_u64_u32 v[1:2], null, v5, v4, v[2:3] ; GFX1150-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: mad_i64_i32_extops_i32_i64: @@ -658,10 +663,13 @@ define i64 @mad_i64_i32_extops_i32_i64(i32 %arg0, i32 %arg1, i64 %arg2) #0 { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v5, v4, v[2:3] -; GFX12-NEXT: v_ashrrev_i32_e32 v2, 31, v5 -; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v2, v4, v[1:2] +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: v_ashrrev_i32_e32 v5, 31, v5 +; GFX12-NEXT: v_mov_b32_e32 v2, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v5, v4, v[2:3] ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-LABEL: mad_i64_i32_extops_i32_i64: @@ -767,6 +775,7 @@ define i64 @mad_u64_u32_bitops_lhs_mask_small(i64 %arg0, i64 %arg1, i64 %arg2) # ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v3, 1, v1 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, v2, v[4:5] +; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX9-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, v2, v[4:5] ; GFX9-NEXT: v_mov_b32_e32 v1, v2 @@ -777,11 +786,13 @@ define i64 @mad_u64_u32_bitops_lhs_mask_small(i64 %arg0, i64 %arg1, i64 %arg2) # ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v0 ; GFX1100-NEXT: v_mov_b32_e32 v6, v1 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1100-NEXT: v_mad_u64_u32 v[0:1], null, v2, v3, v[4:5] -; GFX1100-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_and_b32 v5, 1, v6 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_mad_u64_u32 v[1:2], null, v5, v3, v[4:5] +; GFX1100-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1100-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-NEXT: v_mov_b32_e32 v4, v1 +; GFX1100-NEXT: v_mad_u64_u32 v[1:2], null, v6, v3, v[4:5] ; GFX1100-NEXT: s_setpc_b64 s[30:31] ; ; GFX1150-LABEL: mad_u64_u32_bitops_lhs_mask_small: @@ -789,9 +800,12 @@ define i64 @mad_u64_u32_bitops_lhs_mask_small(i64 %arg0, i64 %arg1, i64 %arg2) # ; GFX1150-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1150-NEXT: v_mov_b32_e32 v3, v1 ; GFX1150-NEXT: v_mad_u64_u32 v[0:1], null, v0, v2, v[4:5] -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX1150-NEXT: v_mad_u64_u32 v[1:2], null, v3, v2, v[1:2] +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1150-NEXT: v_and_b32_e32 v5, 1, v3 +; GFX1150-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1150-NEXT: v_mov_b32_e32 v3, v1 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-NEXT: v_mad_u64_u32 v[1:2], null, v5, v2, v[3:4] ; GFX1150-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: mad_u64_u32_bitops_lhs_mask_small: @@ -803,9 +817,12 @@ define i64 @mad_u64_u32_bitops_lhs_mask_small(i64 %arg0, i64 %arg1, i64 %arg2) # ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v3, v1 ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v0, v2, v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v3, v2, v[1:2] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_and_b32_e32 v5, 1, v3 +; GFX12-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX12-NEXT: v_mov_b32_e32 v3, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v5, v2, v[3:4] ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-LABEL: mad_u64_u32_bitops_lhs_mask_small: @@ -852,10 +869,11 @@ define i64 @mad_u64_u32_bitops_rhs_mask_small(i64 %arg0, i64 %arg1, i64 %arg2) # ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-NEXT: v_and_b32_e32 v7, 1, v3 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, v[4:5] -; GFX9-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX9-NEXT: v_mov_b32_e32 v2, v1 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v3, v[2:3] +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v7, v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -863,21 +881,26 @@ define i64 @mad_u64_u32_bitops_rhs_mask_small(i64 %arg0, i64 %arg1, i64 %arg2) # ; GFX1100: ; %bb.0: ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-NEXT: v_mov_b32_e32 v6, v0 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1100-NEXT: v_mad_u64_u32 v[0:1], null, v6, v2, v[4:5] -; GFX1100-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v4, 1, v3 +; GFX1100-NEXT: v_and_b32_e32 v5, 1, v3 +; GFX1100-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1100-NEXT: v_mov_b32_e32 v3, v1 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_mad_u64_u32 v[1:2], null, v6, v4, v[3:4] +; GFX1100-NEXT: v_mad_u64_u32 v[1:2], null, v6, v5, v[3:4] ; GFX1100-NEXT: s_setpc_b64 s[30:31] ; ; GFX1150-LABEL: mad_u64_u32_bitops_rhs_mask_small: ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1150-NEXT: v_mov_b32_e32 v6, v0 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1150-NEXT: v_mad_u64_u32 v[0:1], null, v6, v2, v[4:5] -; GFX1150-NEXT: v_and_b32_e32 v2, 1, v3 -; GFX1150-NEXT: v_mad_u64_u32 v[1:2], null, v6, v2, v[1:2] +; GFX1150-NEXT: v_and_b32_e32 v4, 1, v3 +; GFX1150-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1150-NEXT: v_mov_b32_e32 v2, v1 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-NEXT: v_mad_u64_u32 v[1:2], null, v6, v4, v[2:3] ; GFX1150-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: mad_u64_u32_bitops_rhs_mask_small: @@ -888,10 +911,13 @@ define i64 @mad_u64_u32_bitops_rhs_mask_small(i64 %arg0, i64 %arg1, i64 %arg2) # ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v6, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v6, v2, v[4:5] -; GFX12-NEXT: v_and_b32_e32 v2, 1, v3 -; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v6, v2, v[1:2] +; GFX12-NEXT: v_and_b32_e32 v4, 1, v3 +; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX12-NEXT: v_mov_b32_e32 v2, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v6, v4, v[2:3] ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-LABEL: mad_u64_u32_bitops_rhs_mask_small: @@ -1795,6 +1821,7 @@ define i64 @lshr_mad_i64_4(i32 %arg0, i64 %arg1) #0 { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v0, 0 +; GFX9-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX9-NEXT: v_mov_b32_e32 v6, v5 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v0, v[6:7] ; GFX9-NEXT: v_mov_b32_e32 v5, 0 @@ -1806,20 +1833,22 @@ define i64 @lshr_mad_i64_4(i32 %arg0, i64 %arg1) #0 { ; GFX1100: ; %bb.0: ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-NEXT: v_mad_u64_u32 v[3:4], null, v1, v0, 0 +; GFX1100-NEXT: ; implicit-def: $vgpr5_vgpr6 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v4, 0 -; GFX1100-NEXT: v_mad_u64_u32 v[5:6], null, v2, v0, v[1:2] +; GFX1100-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, 0 +; GFX1100-NEXT: v_mad_u64_u32 v[7:8], null, v2, v0, v[5:6] ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_mad_u64_u32 v[0:1], null, 0xfffffc88, v5, v[3:4] +; GFX1100-NEXT: v_mad_u64_u32 v[0:1], null, 0xfffffc88, v7, v[3:4] ; GFX1100-NEXT: s_setpc_b64 s[30:31] ; ; GFX1150-LABEL: lshr_mad_i64_4: ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1150-NEXT: v_mad_u64_u32 v[3:4], null, v1, v0, 0 +; GFX1150-NEXT: ; implicit-def: $vgpr5_vgpr6 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v4, 0 -; GFX1150-NEXT: v_mad_u64_u32 v[0:1], null, v2, v0, v[1:2] +; GFX1150-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, 0 +; GFX1150-NEXT: v_mad_u64_u32 v[0:1], null, v2, v0, v[5:6] ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1150-NEXT: v_mad_u64_u32 v[0:1], null, 0xfffffc88, v0, v[3:4] ; GFX1150-NEXT: s_setpc_b64 s[30:31] @@ -1832,9 +1861,10 @@ define i64 @lshr_mad_i64_4(i32 %arg0, i64 %arg1) #0 { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], null, v1, v0, 0 +; GFX12-NEXT: ; implicit-def: $vgpr5_vgpr6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v4, 0 -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v2, v0, v[1:2] +; GFX12-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, 0 +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v2, v0, v[5:6] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, 0xfffffc88, v0, v[3:4] ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -2116,9 +2146,10 @@ define i64 @lshr_mad_i64_negative_4(i64 %arg0) #0 { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, v0, v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, v3 -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v1, v[0:1] +; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v1, v[4:5] ; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2126,22 +2157,22 @@ define i64 @lshr_mad_i64_negative_4(i64 %arg0) #0 { ; GFX1100: ; %bb.0: ; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1100-NEXT: v_mad_u64_u32 v[2:3], null, v1, v0, v[0:1] +; GFX1100-NEXT: ; implicit-def: $vgpr5_vgpr6 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_mov_b32_e32 v0, v3 -; GFX1100-NEXT: v_mad_u64_u32 v[3:4], null, v1, v1, v[0:1] +; GFX1100-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v0, v2 +; GFX1100-NEXT: v_mad_u64_u32 v[3:4], null, v1, v1, v[5:6] ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX1100-NEXT: v_mov_b32_e32 v1, v3 ; GFX1100-NEXT: s_setpc_b64 s[30:31] ; ; GFX1150-LABEL: lshr_mad_i64_negative_4: ; GFX1150: ; %bb.0: ; GFX1150-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1150-NEXT: v_mad_u64_u32 v[3:4], null, v1, v0, v[0:1] +; GFX1150-NEXT: ; implicit-def: $vgpr5_vgpr6 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_mov_b32_e32 v0, v4 -; GFX1150-NEXT: v_mad_u64_u32 v[1:2], null, v1, v1, v[0:1] -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX1150-NEXT: v_mov_b32_e32 v0, v3 +; GFX1150-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v0, v3 +; GFX1150-NEXT: v_mad_u64_u32 v[1:2], null, v1, v1, v[5:6] ; GFX1150-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: lshr_mad_i64_negative_4: @@ -2152,11 +2183,10 @@ define i64 @lshr_mad_i64_negative_4(i64 %arg0) #0 { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], null, v1, v0, v[0:1] +; GFX12-NEXT: ; implicit-def: $vgpr5_vgpr6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_mov_b32_e32 v0, v4 -; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v1, v1, v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v0, v3 +; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v1, v1, v[5:6] ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-LABEL: lshr_mad_i64_negative_4: diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll index 9f27e1ffd9130..a086503dd7664 100644 --- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll +++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll @@ -8,6 +8,7 @@ ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1250,GFX1250-GISEL %s define amdgpu_kernel void @fadd_v2_vv(ptr addrspace(1) %a) { +; ; GFX900-LABEL: fadd_v2_vv: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -51,6 +52,7 @@ define amdgpu_kernel void @fadd_v2_vv(ptr addrspace(1) %a) { } define amdgpu_kernel void @fadd_v2_vs(ptr addrspace(1) %a, <2 x float> %x) { +; ; GFX900-LABEL: fadd_v2_vs: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -96,6 +98,7 @@ define amdgpu_kernel void @fadd_v2_vs(ptr addrspace(1) %a, <2 x float> %x) { } define amdgpu_kernel void @fadd_v4_vs(ptr addrspace(1) %a, <4 x float> %x) { +; ; GFX900-LABEL: fadd_v4_vs: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 @@ -181,6 +184,7 @@ define amdgpu_kernel void @fadd_v4_vs(ptr addrspace(1) %a, <4 x float> %x) { } define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { +; ; GFX900-LABEL: fadd_v32_vs: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -485,6 +489,7 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; FIXME: GISel does not use op_sel for splat constants. define amdgpu_kernel void @fadd_v2_v_imm(ptr addrspace(1) %a) { +; ; GFX900-LABEL: fadd_v2_v_imm: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -559,6 +564,7 @@ define amdgpu_kernel void @fadd_v2_v_imm(ptr addrspace(1) %a) { } define amdgpu_kernel void @fadd_v2_v_v_splat(ptr addrspace(1) %a) { +; ; GFX900-LABEL: fadd_v2_v_v_splat: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -631,6 +637,7 @@ define amdgpu_kernel void @fadd_v2_v_v_splat(ptr addrspace(1) %a) { } define amdgpu_kernel void @fadd_v2_v_lit_splat(ptr addrspace(1) %a) { +; ; GFX900-LABEL: fadd_v2_v_lit_splat: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -703,6 +710,7 @@ define amdgpu_kernel void @fadd_v2_v_lit_splat(ptr addrspace(1) %a) { } define amdgpu_kernel void @fadd_v2_v_lit_hi0(ptr addrspace(1) %a) { +; ; GFX900-LABEL: fadd_v2_v_lit_hi0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -761,6 +769,7 @@ define amdgpu_kernel void @fadd_v2_v_lit_hi0(ptr addrspace(1) %a) { } define amdgpu_kernel void @fadd_v2_v_lit_lo0(ptr addrspace(1) %a) { +; ; GFX900-LABEL: fadd_v2_v_lit_lo0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -821,6 +830,7 @@ define amdgpu_kernel void @fadd_v2_v_lit_lo0(ptr addrspace(1) %a) { } define amdgpu_kernel void @fadd_v2_v_unfoldable_lit(ptr addrspace(1) %a) { +; ; GFX900-LABEL: fadd_v2_v_unfoldable_lit: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -882,6 +892,7 @@ define amdgpu_kernel void @fadd_v2_v_unfoldable_lit(ptr addrspace(1) %a) { ; FIXME: Fold fneg into v_pk_add_f32 with Global ISel. define amdgpu_kernel void @fadd_v2_v_fneg(ptr addrspace(1) %a, float %x) { +; ; GFX900-LABEL: fadd_v2_v_fneg: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -927,10 +938,12 @@ define amdgpu_kernel void @fadd_v2_v_fneg(ptr addrspace(1) %a, float %x) { ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1250-SDAG-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset +; GFX1250-SDAG-NEXT: s_mov_b32 s4, s2 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1] +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[4:5] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1] ; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset ; GFX1250-SDAG-NEXT: s_endpgm ; @@ -959,6 +972,7 @@ define amdgpu_kernel void @fadd_v2_v_fneg(ptr addrspace(1) %a, float %x) { } define amdgpu_kernel void @fadd_v2_v_fneg_lo(ptr addrspace(1) %a, float %x) { +; ; GFX900-LABEL: fadd_v2_v_fneg_lo: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -1004,10 +1018,12 @@ define amdgpu_kernel void @fadd_v2_v_fneg_lo(ptr addrspace(1) %a, float %x) { ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1250-SDAG-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset +; GFX1250-SDAG-NEXT: s_mov_b32 s4, s2 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] op_sel_hi:[1,0] neg_lo:[0,1] +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[4:5] op_sel_hi:[1,0] neg_lo:[0,1] ; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset ; GFX1250-SDAG-NEXT: s_endpgm ; @@ -1036,6 +1052,7 @@ define amdgpu_kernel void @fadd_v2_v_fneg_lo(ptr addrspace(1) %a, float %x) { } define amdgpu_kernel void @fadd_v2_v_fneg_hi(ptr addrspace(1) %a, float %x) { +; ; GFX900-LABEL: fadd_v2_v_fneg_hi: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -1081,10 +1098,12 @@ define amdgpu_kernel void @fadd_v2_v_fneg_hi(ptr addrspace(1) %a, float %x) { ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1250-SDAG-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset +; GFX1250-SDAG-NEXT: s_mov_b32 s4, s2 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] op_sel_hi:[1,0] neg_hi:[0,1] +; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[4:5] op_sel_hi:[1,0] neg_hi:[0,1] ; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset ; GFX1250-SDAG-NEXT: s_endpgm ; @@ -1113,6 +1132,7 @@ define amdgpu_kernel void @fadd_v2_v_fneg_hi(ptr addrspace(1) %a, float %x) { } define amdgpu_kernel void @fadd_v2_v_fneg_lo2(ptr addrspace(1) %a, float %x, float %y) { +; ; GFX900-LABEL: fadd_v2_v_fneg_lo2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -1189,6 +1209,7 @@ define amdgpu_kernel void @fadd_v2_v_fneg_lo2(ptr addrspace(1) %a, float %x, flo } define amdgpu_kernel void @fadd_v2_v_fneg_hi2(ptr addrspace(1) %a, float %x, float %y) { +; ; GFX900-LABEL: fadd_v2_v_fneg_hi2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -1265,6 +1286,7 @@ define amdgpu_kernel void @fadd_v2_v_fneg_hi2(ptr addrspace(1) %a, float %x, flo } define amdgpu_kernel void @fmul_v2_vv(ptr addrspace(1) %a) { +; ; GFX900-LABEL: fmul_v2_vv: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -1308,6 +1330,7 @@ define amdgpu_kernel void @fmul_v2_vv(ptr addrspace(1) %a) { } define amdgpu_kernel void @fmul_v2_vs(ptr addrspace(1) %a, <2 x float> %x) { +; ; GFX900-LABEL: fmul_v2_vs: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -1353,6 +1376,7 @@ define amdgpu_kernel void @fmul_v2_vs(ptr addrspace(1) %a, <2 x float> %x) { } define amdgpu_kernel void @fmul_v4_vs(ptr addrspace(1) %a, <4 x float> %x) { +; ; GFX900-LABEL: fmul_v4_vs: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 @@ -1438,6 +1462,7 @@ define amdgpu_kernel void @fmul_v4_vs(ptr addrspace(1) %a, <4 x float> %x) { } define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { +; ; GFX900-LABEL: fmul_v32_vs: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -1741,6 +1766,7 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { } define amdgpu_kernel void @fmul_v2_v_imm(ptr addrspace(1) %a) { +; ; GFX900-LABEL: fmul_v2_v_imm: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -1815,6 +1841,7 @@ define amdgpu_kernel void @fmul_v2_v_imm(ptr addrspace(1) %a) { } define amdgpu_kernel void @fmul_v2_v_v_splat(ptr addrspace(1) %a) { +; ; GFX900-LABEL: fmul_v2_v_v_splat: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -1887,6 +1914,7 @@ define amdgpu_kernel void @fmul_v2_v_v_splat(ptr addrspace(1) %a) { } define amdgpu_kernel void @fmul_v2_v_lit_splat(ptr addrspace(1) %a) { +; ; GFX900-LABEL: fmul_v2_v_lit_splat: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -1959,6 +1987,7 @@ define amdgpu_kernel void @fmul_v2_v_lit_splat(ptr addrspace(1) %a) { } define amdgpu_kernel void @fmul_v2_v_unfoldable_lit(ptr addrspace(1) %a) { +; ; GFX900-LABEL: fmul_v2_v_unfoldable_lit: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -2019,6 +2048,7 @@ define amdgpu_kernel void @fmul_v2_v_unfoldable_lit(ptr addrspace(1) %a) { } define amdgpu_kernel void @fmul_v2_v_fneg(ptr addrspace(1) %a, float %x) { +; ; GFX900-LABEL: fmul_v2_v_fneg: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -2064,10 +2094,12 @@ define amdgpu_kernel void @fmul_v2_v_fneg(ptr addrspace(1) %a, float %x) { ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1250-SDAG-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset +; GFX1250-SDAG-NEXT: s_mov_b32 s4, s2 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[2:3] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1] +; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[4:5] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1] ; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset ; GFX1250-SDAG-NEXT: s_endpgm ; @@ -2096,6 +2128,7 @@ define amdgpu_kernel void @fmul_v2_v_fneg(ptr addrspace(1) %a, float %x) { } define amdgpu_kernel void @fma_v2_vv(ptr addrspace(1) %a) { +; ; GFX900-LABEL: fma_v2_vv: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -2139,6 +2172,7 @@ define amdgpu_kernel void @fma_v2_vv(ptr addrspace(1) %a) { } define amdgpu_kernel void @fma_v2_vs(ptr addrspace(1) %a, <2 x float> %x) { +; ; GFX900-LABEL: fma_v2_vs: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -2184,6 +2218,7 @@ define amdgpu_kernel void @fma_v2_vs(ptr addrspace(1) %a, <2 x float> %x) { } define amdgpu_kernel void @fma_v4_vs(ptr addrspace(1) %a, <4 x float> %x) { +; ; GFX900-LABEL: fma_v4_vs: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 @@ -2269,6 +2304,7 @@ define amdgpu_kernel void @fma_v4_vs(ptr addrspace(1) %a, <4 x float> %x) { } define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { +; ; GFX900-LABEL: fma_v32_vs: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -2571,6 +2607,7 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { } define amdgpu_kernel void @fma_v2_v_imm(ptr addrspace(1) %a) { +; ; GFX900-LABEL: fma_v2_v_imm: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -2673,6 +2710,7 @@ define amdgpu_kernel void @fma_v2_v_imm(ptr addrspace(1) %a) { } define amdgpu_kernel void @fma_v2_v_v_splat(ptr addrspace(1) %a) { +; ; GFX900-LABEL: fma_v2_v_v_splat: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -2745,6 +2783,7 @@ define amdgpu_kernel void @fma_v2_v_v_splat(ptr addrspace(1) %a) { } define amdgpu_kernel void @fma_v2_v_lit_splat(ptr addrspace(1) %a) { +; ; GFX900-LABEL: fma_v2_v_lit_splat: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -2840,6 +2879,7 @@ define amdgpu_kernel void @fma_v2_v_lit_splat(ptr addrspace(1) %a) { } define amdgpu_kernel void @fma_v2_v_unfoldable_lit(ptr addrspace(1) %a) { +; ; GFX900-LABEL: fma_v2_v_unfoldable_lit: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -2940,6 +2980,7 @@ define amdgpu_kernel void @fma_v2_v_unfoldable_lit(ptr addrspace(1) %a) { } define amdgpu_kernel void @fma_v2_v_fneg(ptr addrspace(1) %a, float %x) { +; ; GFX900-LABEL: fma_v2_v_fneg: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -2985,10 +3026,12 @@ define amdgpu_kernel void @fma_v2_v_fneg(ptr addrspace(1) %a, float %x) { ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1250-SDAG-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset +; GFX1250-SDAG-NEXT: s_mov_b32 s4, s2 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], s[2:3] op_sel_hi:[1,0,0] neg_lo:[0,1,1] neg_hi:[0,1,1] +; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[4:5], s[4:5] op_sel_hi:[1,0,0] neg_lo:[0,1,1] neg_hi:[0,1,1] ; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset ; GFX1250-SDAG-NEXT: s_endpgm ; @@ -3017,6 +3060,7 @@ define amdgpu_kernel void @fma_v2_v_fneg(ptr addrspace(1) %a, float %x) { } define amdgpu_kernel void @add_vector_neg_bitcast_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) { +; ; GFX900-LABEL: add_vector_neg_bitcast_scalar_lo: ; GFX900: ; %bb.0: ; %bb ; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -3104,6 +3148,7 @@ bb: } define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo_scalar_hi(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) { +; ; GFX900-LABEL: fma_vector_vector_neg_scalar_lo_scalar_hi: ; GFX900: ; %bb.0: ; %bb ; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -3206,6 +3251,7 @@ bb: } define amdgpu_kernel void @shuffle_add_f32(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 { +; ; GFX900-LABEL: shuffle_add_f32: ; GFX900: ; %bb.0: ; %bb ; GFX900-NEXT: s_load_dword s0, s[4:5], 0x2c @@ -3284,6 +3330,7 @@ bb: } define amdgpu_kernel void @shuffle_neg_add_f32(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 { +; ; GFX900-LABEL: shuffle_neg_add_f32: ; GFX900: ; %bb.0: ; %bb ; GFX900-NEXT: s_load_dword s0, s[4:5], 0x2c @@ -3382,6 +3429,7 @@ bb: } define amdgpu_kernel void @fadd_fadd_fsub_0(<2 x float> %arg) { +; ; GFX900-LABEL: fadd_fadd_fsub_0: ; GFX900: ; %bb.0: ; %bb ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -3405,9 +3453,10 @@ define amdgpu_kernel void @fadd_fadd_fsub_0(<2 x float> %arg) { ; GFX90A-GISEL-LABEL: fadd_fadd_fsub_0: ; GFX90A-GISEL: ; %bb.0: ; %bb ; GFX90A-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX90A-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-GISEL-NEXT: v_pk_add_f32 v[0:1], s[0:1], 0 -; GFX90A-GISEL-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-GISEL-NEXT: v_pk_add_f32 v[2:3], s[0:1], 0 +; GFX90A-GISEL-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], 0 ; GFX90A-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX90A-GISEL-NEXT: v_mov_b32_e32 v3, v0 @@ -3417,10 +3466,11 @@ define amdgpu_kernel void @fadd_fadd_fsub_0(<2 x float> %arg) { ; GFX942-GISEL-LABEL: fadd_fadd_fsub_0: ; GFX942-GISEL: ; %bb.0: ; %bb ; GFX942-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-GISEL-NEXT: v_pk_add_f32 v[0:1], s[0:1], 0 +; GFX942-GISEL-NEXT: v_pk_add_f32 v[2:3], s[0:1], 0 ; GFX942-GISEL-NEXT: s_nop 0 -; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, v1 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], 0 ; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, v0 @@ -3441,15 +3491,15 @@ define amdgpu_kernel void @fadd_fadd_fsub_0(<2 x float> %arg) { ; GFX1250-GISEL-LABEL: fadd_fadd_fsub_0: ; GFX1250-GISEL: ; %bb.0: ; %bb ; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], 0 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, v1 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], 0 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v3, v0 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[2:3], 0 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, v0 ; GFX1250-GISEL-NEXT: flat_store_b64 v[0:1], v[2:3] ; GFX1250-GISEL-NEXT: s_endpgm bb: @@ -3463,6 +3513,7 @@ bb: } define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg, <2 x float> %arg1, ptr addrspace(1) %ptr) { +; ; GFX900-LABEL: fadd_fadd_fsub: ; GFX900: ; %bb.0: ; %bb ; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -3496,35 +3547,37 @@ define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg, <2 x float> %arg1, p ; GFX90A-GISEL-LABEL: fadd_fadd_fsub: ; GFX90A-GISEL: ; %bb.0: ; %bb ; GFX90A-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX90A-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX90A-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-GISEL-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-GISEL-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-GISEL-NEXT: v_pk_add_f32 v[0:1], s[0:1], v[0:1] -; GFX90A-GISEL-NEXT: v_sub_f32_e32 v0, s0, v2 -; GFX90A-GISEL-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-GISEL-NEXT: v_pk_add_f32 v[2:3], s[2:3], v[2:3] -; GFX90A-GISEL-NEXT: v_subrev_f32_e32 v1, s3, v2 -; GFX90A-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX90A-GISEL-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-GISEL-NEXT: v_pk_add_f32 v[2:3], s[0:1], v[2:3] +; GFX90A-GISEL-NEXT: v_sub_f32_e32 v2, s0, v0 +; GFX90A-GISEL-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-GISEL-NEXT: v_pk_add_f32 v[0:1], s[2:3], v[0:1] +; GFX90A-GISEL-NEXT: v_subrev_f32_e32 v3, s3, v0 +; GFX90A-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-GISEL-NEXT: global_store_dwordx2 v0, v[2:3], s[6:7] ; GFX90A-GISEL-NEXT: s_endpgm ; ; GFX942-GISEL-LABEL: fadd_fadd_fsub: ; GFX942-GISEL: ; %bb.0: ; %bb ; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX942-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, s2 -; GFX942-GISEL-NEXT: v_pk_add_f32 v[0:1], s[0:1], v[0:1] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-GISEL-NEXT: v_pk_add_f32 v[2:3], s[0:1], v[2:3] ; GFX942-GISEL-NEXT: s_nop 0 -; GFX942-GISEL-NEXT: v_sub_f32_e32 v0, s0, v2 -; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-GISEL-NEXT: v_pk_add_f32 v[2:3], s[2:3], v[2:3] +; GFX942-GISEL-NEXT: v_sub_f32_e32 v2, s0, v0 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-GISEL-NEXT: v_pk_add_f32 v[0:1], s[2:3], v[0:1] ; GFX942-GISEL-NEXT: s_nop 0 -; GFX942-GISEL-NEXT: v_subrev_f32_e32 v1, s3, v2 -; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX942-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX942-GISEL-NEXT: v_subrev_f32_e32 v3, s3, v0 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-GISEL-NEXT: global_store_dwordx2 v0, v[2:3], s[6:7] ; GFX942-GISEL-NEXT: s_endpgm ; ; GFX1250-SDAG-LABEL: fadd_fadd_fsub: @@ -3549,15 +3602,16 @@ define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg, <2 x float> %arg1, p ; GFX1250-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_sub_f32 s0, s0, s2 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, v1 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v4, v1 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[2:3], v[0:1] +; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[2:3], v[4:5] ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_subrev_f32 v3, s3, v0 ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-GISEL-NEXT: global_store_b64 v0, v[2:3], s[4:5] @@ -3573,6 +3627,7 @@ bb: } define amdgpu_kernel void @fadd_shuffle_v4(ptr addrspace(1) %arg) { +; ; GFX900-LABEL: fadd_shuffle_v4: ; GFX900: ; %bb.0: ; %bb ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -3651,6 +3706,7 @@ bb: } define amdgpu_kernel void @fneg_v2f32_vec(ptr addrspace(1) %a) { +; ; GFX900-LABEL: fneg_v2f32_vec: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -3717,6 +3773,7 @@ define amdgpu_kernel void @fneg_v2f32_vec(ptr addrspace(1) %a) { } define amdgpu_kernel void @fneg_v2f32_scalar(ptr addrspace(1) %a, <2 x float> %x) { +; ; GFX900-LABEL: fneg_v2f32_scalar: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 diff --git a/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll b/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll index 25609e881254e..0a8aa31dd7548 100644 --- a/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll +++ b/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll @@ -3357,7 +3357,9 @@ define i64 @v_mul_9_add_52_i64(i64 %arg) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v2, v1 ; GFX900-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, 9, 52 -; GFX900-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v2, 9, v[1:2] +; GFX900-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v2, 9, v[3:4] ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_mul_9_add_52_i64: @@ -3365,6 +3367,7 @@ define i64 @v_mul_9_add_52_i64(i64 %arg) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v2, v1 ; GFX90A-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, 9, 52 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: v_mov_b32_e32 v4, v1 ; GFX90A-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, 9, v[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v2 @@ -3375,7 +3378,9 @@ define i64 @v_mul_9_add_52_i64(i64 %arg) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v1 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], null, v0, 9, 52 -; GFX10-NEXT: v_mad_u64_u32 v[1:2], null, v2, 9, v[1:2] +; GFX10-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], null, v2, 9, v[3:4] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-LABEL: v_mul_9_add_52_i64: @@ -3694,7 +3699,9 @@ define i64 @v_mul_5_add_1_i64(i64 %arg) { ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v2, v1 ; GFX900-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, 5, 1 -; GFX900-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v2, 5, v[1:2] +; GFX900-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v2, 5, v[3:4] ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_mul_5_add_1_i64: @@ -3702,6 +3709,7 @@ define i64 @v_mul_5_add_1_i64(i64 %arg) { ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v2, v1 ; GFX90A-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, 5, 1 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: v_mov_b32_e32 v4, v1 ; GFX90A-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, 5, v[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v2 @@ -3712,7 +3720,9 @@ define i64 @v_mul_5_add_1_i64(i64 %arg) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v1 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], null, v0, 5, 1 -; GFX10-NEXT: v_mad_u64_u32 v[1:2], null, v2, 5, v[1:2] +; GFX10-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], null, v2, 5, v[3:4] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-LABEL: v_mul_5_add_1_i64: @@ -3772,7 +3782,9 @@ define i64 @v_mul_284_add_82_i64(i64 %arg) { ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 ; GFX900-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, s6, v[3:4] -; GFX900-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v2, s6, v[1:2] +; GFX900-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v2, s6, v[3:4] ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_mul_284_add_82_i64: @@ -3783,6 +3795,7 @@ define i64 @v_mul_284_add_82_i64(i64 %arg) { ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v1 ; GFX90A-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, s6, v[4:5] +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: v_mov_b32_e32 v4, v1 ; GFX90A-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, s6, v[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v2 @@ -3794,7 +3807,9 @@ define i64 @v_mul_284_add_82_i64(i64 %arg) { ; GFX10-NEXT: s_movk_i32 s4, 0x11c ; GFX10-NEXT: v_mov_b32_e32 v2, v1 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], null, v0, s4, 0x52 -; GFX10-NEXT: v_mad_u64_u32 v[1:2], null, 0x11c, v2, v[1:2] +; GFX10-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], null, 0x11c, v2, v[3:4] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-LABEL: v_mul_284_add_82_i64: @@ -3855,7 +3870,9 @@ define i64 @v_mul_934584645_add_8234599_i64(i64 %arg) { ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v1 ; GFX900-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, s6, v[3:4] -; GFX900-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v2, s6, v[1:2] +; GFX900-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v2, s6, v[3:4] ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_mul_934584645_add_8234599_i64: @@ -3866,6 +3883,7 @@ define i64 @v_mul_934584645_add_8234599_i64(i64 %arg) { ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v1 ; GFX90A-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, s6, v[4:5] +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: v_mov_b32_e32 v4, v1 ; GFX90A-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, s6, v[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v2 @@ -3877,7 +3895,9 @@ define i64 @v_mul_934584645_add_8234599_i64(i64 %arg) { ; GFX10-NEXT: s_mov_b32 s4, 0x37b4a145 ; GFX10-NEXT: v_mov_b32_e32 v2, v1 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], null, v0, s4, 0x7da667 -; GFX10-NEXT: v_mad_u64_u32 v[1:2], null, 0x37b4a145, v2, v[1:2] +; GFX10-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: v_mad_u64_u32 v[1:2], null, 0x37b4a145, v2, v[3:4] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-LABEL: v_mul_934584645_add_8234599_i64: diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll index 5f42abbeae253..9f0c130688b3c 100644 --- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll @@ -499,9 +499,10 @@ define void @test_rewrite_mfma_subreg_insert0(float %arg0, float %arg1, ptr addr ; CHECK-LABEL: test_rewrite_mfma_subreg_insert0: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: global_load_dwordx4 a[0:3], v[2:3], off +; CHECK-NEXT: global_load_dwordx4 a[8:11], v[2:3], off +; CHECK-NEXT: ; implicit-def: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v1, a[0:3] +; CHECK-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v1, a[8:11] ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use a[0:7] ; CHECK-NEXT: ;;#ASMEND @@ -519,6 +520,7 @@ define void @test_rewrite_mfma_subreg_insert1(float %arg0, float %arg1, ptr addr ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off +; CHECK-NEXT: ; implicit-def: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v0, v1, v[2:5] ; CHECK-NEXT: s_nop 3 @@ -544,7 +546,11 @@ define void @test_rewrite_mfma_subreg_insert2(double %arg0, double %arg1, ptr ad ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: global_load_dwordx2 a[0:1], v[4:5], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_mfma_f64_4x4x4_4b_f64 a[0:1], v[0:1], v[2:3], a[0:1] +; CHECK-NEXT: v_mfma_f64_4x4x4_4b_f64 a[4:5], v[0:1], v[2:3], a[0:1] +; CHECK-NEXT: ; implicit-def: $agpr0_agpr1_agpr2_agpr3 +; CHECK-NEXT: s_nop 5 +; CHECK-NEXT: v_accvgpr_mov_b32 a0, a4 +; CHECK-NEXT: v_accvgpr_mov_b32 a1, a5 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use a[0:3] ; CHECK-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll index 7a3bff8aed56e..256e4bf5f65f0 100644 --- a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll +++ b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll @@ -2640,90 +2640,90 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; GFX9-FLATSCR-LABEL: cs_main: ; GFX9-FLATSCR: ; %bb.0: ; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s2 -; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 -; GFX9-FLATSCR-NEXT: v_and_b32_e32 v23, 0x1fc, v0 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4 ; GFX9-FLATSCR-NEXT: s_mov_b32 s0, 0 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f523be1 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f3d349e -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v12, 0x3eae29dc -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v13, 0x3eae29d8 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3e319356 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e31934f -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:320 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:240 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v7 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v6 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v15, 0xbf3d349e -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf20e7f5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, 0x3efcd89f -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3efcd89c -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0xb702e758 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbeae29dc -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xbe31934f -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe319356 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:224 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf523be3 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf5f2ee3 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f638e37 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, 0xbf638e39 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v18 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f20e7f5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v15 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v21 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:304 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:208 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:192 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v23 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v18 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v4 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, v5 -; GFX9-FLATSCR-NEXT: scratch_load_dword v13, v1, off -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:768 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf523be1 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0x3f3d349c -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, v6 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v15 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:784 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v7 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee2 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f3d349e +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v13, 0x3eae29dc +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e319356 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:320 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[13:16], s0 offset:240 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v8 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v7 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v6 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf3d349e +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbf20e7f5 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3efcd89f +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, 0x3efcd89c +; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:288 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xbeae29dc +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xb7043519 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe31934f +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v12, 0xbe319356 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[16:19], s0 offset:224 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf523be3 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, 0xbf5f2ee3 +; GFX9-FLATSCR-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd89f +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, 0xbefcd8a3 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[9:12], s0 offset:256 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf638e39 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v19 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, 0x3f20e7f5 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v16 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, v22 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[5:8], s0 offset:304 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:272 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:208 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:192 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, v19 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v5 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v17 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0x3703c499 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v14 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v23, v6 +; GFX9-FLATSCR-NEXT: scratch_load_dword v14, v2, off +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0x3f20e7f4 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s0 offset:768 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, 0xbf523be1 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, 0x3f3d349c +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v7 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v16 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[2:5], s0 offset:832 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:784 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, v8 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf5f2ee2 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, v6 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, v18 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3703c499 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v15 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v13 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v18 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:752 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:736 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:816 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v17 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v3 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, v1 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v12 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v17 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:752 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:736 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:816 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, v16 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v2 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v0 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v11 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v14 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v12 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, v2 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, v0 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800 -; GFX9-FLATSCR-NEXT: v_add_u32_e32 v1, 0x200, v23 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:720 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:704 -; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v1, off +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v15 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, v13 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, v3 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, v1 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[9:12], s0 offset:800 +; GFX9-FLATSCR-NEXT: v_add_u32_e32 v0, 0x200, v0 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[5:8], s0 offset:720 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[16:19], s0 offset:704 +; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v0, off ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: v_add_f32_e32 v0, v13, v0 +; GFX9-FLATSCR-NEXT: v_add_f32_e32 v0, v14, v0 ; GFX9-FLATSCR-NEXT: ; return to shader part epilog ; ; GFX10-FLATSCR-LABEL: cs_main: @@ -2822,93 +2822,93 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; GFX9-FLATSCR-PAL-NEXT: s_getpc_b64 s[2:3] ; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s2, s0 ; GFX9-FLATSCR-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x10 -; GFX9-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-FLATSCR-PAL-NEXT: v_and_b32_e32 v23, 0x1fc, v0 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3f523be1 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3f3d349e ; GFX9-FLATSCR-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-FLATSCR-PAL-NEXT: s_and_b32 s3, s3, 0xffff ; GFX9-FLATSCR-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s0 ; GFX9-FLATSCR-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s0, 0 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3f523be1 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3f3d349e -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0x3eae29dc -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, 0x3eae29d8 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3e319356 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0x3e31934f -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:320 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:240 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v7 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v6 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v5 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0xbf3d349e -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf20e7f5 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0x3efcd89f -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3efcd89c -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0xb702e758 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbeae29dc -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, 0xbe31934f -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbe319356 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:224 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf523be3 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf5f2ee3 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f638e37 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0xbf638e39 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v18 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3f20e7f5 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v15 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v21 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:304 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:208 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:192 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v23 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v18 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v4 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v5 -; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v13, v1, off -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:768 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf523be1 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0x3f3d349c -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, v6 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v15 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:784 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v7 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee2 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, 0x3eae29dc +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3eae29d8 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0x3e319356 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0x3e31934f +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:320 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[13:16], s0 offset:240 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v8 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v7 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v6 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf3d349e +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0xbf20e7f5 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3efcd89f +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0x3efcd89c +; GFX9-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:288 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xbeae29dc +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, 0xb7043519 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbe31934f +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0xbe319356 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[16:19], s0 offset:224 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, 0xbf523be3 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0xbf5f2ee3 +; GFX9-FLATSCR-PAL-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f638e37 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0xbefcd89f +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0xbefcd8a3 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[9:12], s0 offset:256 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf638e39 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v19 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0x3f20e7f5 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v16 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, v22 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[5:8], s0 offset:304 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:272 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:208 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:192 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v19 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v5 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v17 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0x3703c499 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v14 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, v6 +; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v14, v2, off +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0x3f20e7f4 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[20:23], s0 offset:768 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0xbf523be1 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, 0x3f3d349c +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v7 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v16 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[2:5], s0 offset:832 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:784 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, v8 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf5f2ee2 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v6 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, v18 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3703c499 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v15 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v13 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v18 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:752 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:736 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:816 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v17 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v3 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, v1 ; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v12 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v17 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:752 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:736 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:816 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, v16 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v2 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v0 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v11 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v14 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v12 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, v2 -; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v0 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800 -; GFX9-FLATSCR-PAL-NEXT: v_add_u32_e32 v1, 0x200, v23 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:720 -; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:704 -; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v0, v1, off +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v15 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, v13 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v3 +; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, v1 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[9:12], s0 offset:800 +; GFX9-FLATSCR-PAL-NEXT: v_add_u32_e32 v0, 0x200, v0 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[5:8], s0 offset:720 +; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[16:19], s0 offset:704 +; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v0, v0, off ; GFX9-FLATSCR-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v13, v0 +; GFX9-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v14, v0 ; GFX9-FLATSCR-PAL-NEXT: ; return to shader part epilog ; ; GFX10-FLATSCR-PAL-LABEL: cs_main: diff --git a/llvm/test/CodeGen/AMDGPU/sext-in-reg-vector-shuffle.ll b/llvm/test/CodeGen/AMDGPU/sext-in-reg-vector-shuffle.ll index 584d26ed41893..35ca9961c8aad 100644 --- a/llvm/test/CodeGen/AMDGPU/sext-in-reg-vector-shuffle.ll +++ b/llvm/test/CodeGen/AMDGPU/sext-in-reg-vector-shuffle.ll @@ -8,6 +8,8 @@ define amdgpu_kernel void @v_sext_in_reg_i8_i16_shuffle_vector(ptr addrspace(1) ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_b64 v[1:2], v0, s[2:3] ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v2f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v2f32.ll index 9b3dc7f531021..c2bb7806498cf 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v2f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v2f32.ll @@ -57,37 +57,41 @@ define void @v_shuffle_v2f32_v2f32__0_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v2f32_v2f32__1_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v2f32__1_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2f32_v2f32__1_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2f32_v2f32__1_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -110,37 +114,41 @@ define void @v_shuffle_v2f32_v2f32__2_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v2f32_v2f32__3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v2f32__3_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2f32_v2f32__3_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2f32_v2f32__3_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -350,33 +358,37 @@ define void @v_shuffle_v2f32_v2f32__u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2f32_v2f32__u_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2f32_v2f32__u_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -476,33 +488,37 @@ define void @v_shuffle_v2f32_v2f32__2_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2f32_v2f32__2_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2f32_v2f32__2_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -723,37 +739,41 @@ define void @v_shuffle_v2f32_v2f32__0_2(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v2f32_v2f32__1_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2f32_v2f32__1_2: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2f32_v2f32__1_2: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2f32_v2f32__1_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -1016,6 +1036,7 @@ define void @s_shuffle_v2f32_v2f32__0_u() { define void @s_shuffle_v2f32_v2f32__1_u() { ; GFX900-LABEL: s_shuffle_v2f32_v2f32__1_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -1028,6 +1049,7 @@ define void @s_shuffle_v2f32_v2f32__1_u() { ; ; GFX90A-LABEL: s_shuffle_v2f32_v2f32__1_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -1040,6 +1062,7 @@ define void @s_shuffle_v2f32_v2f32__1_u() { ; ; GFX942-LABEL: s_shuffle_v2f32_v2f32__1_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -1072,6 +1095,7 @@ define void @s_shuffle_v2f32_v2f32__2_u() { define void @s_shuffle_v2f32_v2f32__3_u() { ; GFX900-LABEL: s_shuffle_v2f32_v2f32__3_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -1084,6 +1108,7 @@ define void @s_shuffle_v2f32_v2f32__3_u() { ; ; GFX90A-LABEL: s_shuffle_v2f32_v2f32__3_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -1096,6 +1121,7 @@ define void @s_shuffle_v2f32_v2f32__3_u() { ; ; GFX942-LABEL: s_shuffle_v2f32_v2f32__3_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -1287,6 +1313,7 @@ define void @s_shuffle_v2f32_v2f32__3_3() { define void @s_shuffle_v2f32_v2f32__u_0() { ; GFX900-LABEL: s_shuffle_v2f32_v2f32__u_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -1299,6 +1326,7 @@ define void @s_shuffle_v2f32_v2f32__u_0() { ; ; GFX90A-LABEL: s_shuffle_v2f32_v2f32__u_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -1311,6 +1339,7 @@ define void @s_shuffle_v2f32_v2f32__u_0() { ; ; GFX942-LABEL: s_shuffle_v2f32_v2f32__u_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -1392,6 +1421,7 @@ define void @s_shuffle_v2f32_v2f32__1_0() { define void @s_shuffle_v2f32_v2f32__2_0() { ; GFX900-LABEL: s_shuffle_v2f32_v2f32__2_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -1404,6 +1434,7 @@ define void @s_shuffle_v2f32_v2f32__2_0() { ; ; GFX90A-LABEL: s_shuffle_v2f32_v2f32__2_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -1416,6 +1447,7 @@ define void @s_shuffle_v2f32_v2f32__2_0() { ; ; GFX942-LABEL: s_shuffle_v2f32_v2f32__2_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -1626,6 +1658,7 @@ define void @s_shuffle_v2f32_v2f32__0_2() { define void @s_shuffle_v2f32_v2f32__1_2() { ; GFX900-LABEL: s_shuffle_v2f32_v2f32__1_2: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -1638,6 +1671,7 @@ define void @s_shuffle_v2f32_v2f32__1_2() { ; ; GFX90A-LABEL: s_shuffle_v2f32_v2f32__1_2: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -1650,6 +1684,7 @@ define void @s_shuffle_v2f32_v2f32__1_2() { ; ; GFX942-LABEL: s_shuffle_v2f32_v2f32__1_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v3f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v3f32.ll index 34043cd067b25..0053e66c94b9c 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v3f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v3f32.ll @@ -72,9 +72,10 @@ define void @v_shuffle_v2f32_v3f32__1_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -84,9 +85,10 @@ define void @v_shuffle_v2f32_v3f32__1_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -102,6 +104,7 @@ define void @v_shuffle_v2f32_v3f32__2_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v2 ; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] @@ -114,6 +117,7 @@ define void @v_shuffle_v2f32_v3f32__2_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] @@ -126,6 +130,7 @@ define void @v_shuffle_v2f32_v3f32__2_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] @@ -166,9 +171,10 @@ define void @v_shuffle_v2f32_v3f32__4_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -178,9 +184,10 @@ define void @v_shuffle_v2f32_v3f32__4_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -197,6 +204,7 @@ define void @v_shuffle_v2f32_v3f32__5_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v2 ; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] @@ -209,6 +217,7 @@ define void @v_shuffle_v2f32_v3f32__5_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] @@ -221,6 +230,7 @@ define void @v_shuffle_v2f32_v3f32__5_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] @@ -536,9 +546,10 @@ define void @v_shuffle_v2f32_v3f32__u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -548,9 +559,10 @@ define void @v_shuffle_v2f32_v3f32__u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -560,9 +572,10 @@ define void @v_shuffle_v2f32_v3f32__u_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -706,9 +719,10 @@ define void @v_shuffle_v2f32_v3f32__3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -718,9 +732,10 @@ define void @v_shuffle_v2f32_v3f32__3_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -730,9 +745,10 @@ define void @v_shuffle_v2f32_v3f32__3_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1066,6 +1082,7 @@ define void @v_shuffle_v2f32_v3f32__u_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] @@ -1078,6 +1095,7 @@ define void @v_shuffle_v2f32_v3f32__u_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: v_mov_b32_e32 v1, v2 ; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] @@ -1236,6 +1254,7 @@ define void @v_shuffle_v2f32_v3f32__3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] @@ -1248,6 +1267,7 @@ define void @v_shuffle_v2f32_v3f32__3_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: v_mov_b32_e32 v1, v2 ; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] @@ -1381,9 +1401,10 @@ define void @v_shuffle_v2f32_v3f32__1_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1393,9 +1414,10 @@ define void @v_shuffle_v2f32_v3f32__1_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1411,6 +1433,7 @@ define void @v_shuffle_v2f32_v3f32__2_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v2 ; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] @@ -1423,6 +1446,7 @@ define void @v_shuffle_v2f32_v3f32__2_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] @@ -1435,6 +1459,7 @@ define void @v_shuffle_v2f32_v3f32__2_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] @@ -1800,6 +1825,7 @@ define void @v_shuffle_v2f32_v3f32__u_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] @@ -1812,6 +1838,7 @@ define void @v_shuffle_v2f32_v3f32__u_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: v_mov_b32_e32 v1, v2 ; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] @@ -2129,6 +2156,7 @@ define void @s_shuffle_v2f32_v3f32__0_u() { define void @s_shuffle_v2f32_v3f32__1_u() { ; GFX900-LABEL: s_shuffle_v2f32_v3f32__1_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -2141,6 +2169,7 @@ define void @s_shuffle_v2f32_v3f32__1_u() { ; ; GFX90A-LABEL: s_shuffle_v2f32_v3f32__1_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -2153,6 +2182,7 @@ define void @s_shuffle_v2f32_v3f32__1_u() { ; ; GFX942-LABEL: s_shuffle_v2f32_v3f32__1_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -2171,6 +2201,7 @@ define void @s_shuffle_v2f32_v3f32__1_u() { define void @s_shuffle_v2f32_v3f32__2_u() { ; GFX900-LABEL: s_shuffle_v2f32_v3f32__2_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -2183,6 +2214,7 @@ define void @s_shuffle_v2f32_v3f32__2_u() { ; ; GFX90A-LABEL: s_shuffle_v2f32_v3f32__2_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -2195,6 +2227,7 @@ define void @s_shuffle_v2f32_v3f32__2_u() { ; ; GFX942-LABEL: s_shuffle_v2f32_v3f32__2_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -2227,6 +2260,7 @@ define void @s_shuffle_v2f32_v3f32__3_u() { define void @s_shuffle_v2f32_v3f32__4_u() { ; GFX900-LABEL: s_shuffle_v2f32_v3f32__4_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -2239,6 +2273,7 @@ define void @s_shuffle_v2f32_v3f32__4_u() { ; ; GFX90A-LABEL: s_shuffle_v2f32_v3f32__4_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -2251,6 +2286,7 @@ define void @s_shuffle_v2f32_v3f32__4_u() { ; ; GFX942-LABEL: s_shuffle_v2f32_v3f32__4_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -2270,6 +2306,7 @@ define void @s_shuffle_v2f32_v3f32__4_u() { define void @s_shuffle_v2f32_v3f32__5_u() { ; GFX900-LABEL: s_shuffle_v2f32_v3f32__5_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -2282,6 +2319,7 @@ define void @s_shuffle_v2f32_v3f32__5_u() { ; ; GFX90A-LABEL: s_shuffle_v2f32_v3f32__5_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -2294,6 +2332,7 @@ define void @s_shuffle_v2f32_v3f32__5_u() { ; ; GFX942-LABEL: s_shuffle_v2f32_v3f32__5_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -2586,6 +2625,7 @@ define void @s_shuffle_v2f32_v3f32__5_5() { define void @s_shuffle_v2f32_v3f32__u_0() { ; GFX900-LABEL: s_shuffle_v2f32_v3f32__u_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -2598,6 +2638,7 @@ define void @s_shuffle_v2f32_v3f32__u_0() { ; ; GFX90A-LABEL: s_shuffle_v2f32_v3f32__u_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -2610,6 +2651,7 @@ define void @s_shuffle_v2f32_v3f32__u_0() { ; ; GFX942-LABEL: s_shuffle_v2f32_v3f32__u_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -2736,6 +2778,7 @@ define void @s_shuffle_v2f32_v3f32__2_0() { define void @s_shuffle_v2f32_v3f32__3_0() { ; GFX900-LABEL: s_shuffle_v2f32_v3f32__3_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -2748,6 +2791,7 @@ define void @s_shuffle_v2f32_v3f32__3_0() { ; ; GFX90A-LABEL: s_shuffle_v2f32_v3f32__3_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -2760,6 +2804,7 @@ define void @s_shuffle_v2f32_v3f32__3_0() { ; ; GFX942-LABEL: s_shuffle_v2f32_v3f32__3_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -3041,6 +3086,7 @@ define void @s_shuffle_v2f32_v3f32__4_1() { define void @s_shuffle_v2f32_v3f32__u_2() { ; GFX900-LABEL: s_shuffle_v2f32_v3f32__u_2: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -3053,6 +3099,7 @@ define void @s_shuffle_v2f32_v3f32__u_2() { ; ; GFX90A-LABEL: s_shuffle_v2f32_v3f32__u_2: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -3065,6 +3112,7 @@ define void @s_shuffle_v2f32_v3f32__u_2() { ; ; GFX942-LABEL: s_shuffle_v2f32_v3f32__u_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -3191,6 +3239,7 @@ define void @s_shuffle_v2f32_v3f32__2_2() { define void @s_shuffle_v2f32_v3f32__3_2() { ; GFX900-LABEL: s_shuffle_v2f32_v3f32__3_2: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -3203,6 +3252,7 @@ define void @s_shuffle_v2f32_v3f32__3_2() { ; ; GFX90A-LABEL: s_shuffle_v2f32_v3f32__3_2: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -3215,6 +3265,7 @@ define void @s_shuffle_v2f32_v3f32__3_2() { ; ; GFX942-LABEL: s_shuffle_v2f32_v3f32__3_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -3342,6 +3393,7 @@ define void @s_shuffle_v2f32_v3f32__0_3() { define void @s_shuffle_v2f32_v3f32__1_3() { ; GFX900-LABEL: s_shuffle_v2f32_v3f32__1_3: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -3354,6 +3406,7 @@ define void @s_shuffle_v2f32_v3f32__1_3() { ; ; GFX90A-LABEL: s_shuffle_v2f32_v3f32__1_3: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -3366,6 +3419,7 @@ define void @s_shuffle_v2f32_v3f32__1_3() { ; ; GFX942-LABEL: s_shuffle_v2f32_v3f32__1_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -3384,6 +3438,7 @@ define void @s_shuffle_v2f32_v3f32__1_3() { define void @s_shuffle_v2f32_v3f32__2_3() { ; GFX900-LABEL: s_shuffle_v2f32_v3f32__2_3: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -3396,6 +3451,7 @@ define void @s_shuffle_v2f32_v3f32__2_3() { ; ; GFX90A-LABEL: s_shuffle_v2f32_v3f32__2_3: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -3408,6 +3464,7 @@ define void @s_shuffle_v2f32_v3f32__2_3() { ; ; GFX942-LABEL: s_shuffle_v2f32_v3f32__2_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -3743,6 +3800,7 @@ define void @s_shuffle_v2f32_v3f32__4_4() { define void @s_shuffle_v2f32_v3f32__u_5() { ; GFX900-LABEL: s_shuffle_v2f32_v3f32__u_5: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -3755,6 +3813,7 @@ define void @s_shuffle_v2f32_v3f32__u_5() { ; ; GFX90A-LABEL: s_shuffle_v2f32_v3f32__u_5: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -3767,6 +3826,7 @@ define void @s_shuffle_v2f32_v3f32__u_5() { ; ; GFX942-LABEL: s_shuffle_v2f32_v3f32__u_5: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v4f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v4f32.ll index 07ca294019341..8c507189f18e1 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v4f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v4f32.ll @@ -72,9 +72,10 @@ define void @v_shuffle_v2f32_v4f32__1_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -84,9 +85,10 @@ define void @v_shuffle_v2f32_v4f32__1_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -141,6 +143,7 @@ define void @v_shuffle_v2f32_v4f32__3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v3 ; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] @@ -153,6 +156,7 @@ define void @v_shuffle_v2f32_v4f32__3_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] @@ -165,6 +169,7 @@ define void @v_shuffle_v2f32_v4f32__3_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] @@ -205,9 +210,10 @@ define void @v_shuffle_v2f32_v4f32__5_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -217,9 +223,10 @@ define void @v_shuffle_v2f32_v4f32__5_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -276,6 +283,7 @@ define void @v_shuffle_v2f32_v4f32__7_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v3 ; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] @@ -288,6 +296,7 @@ define void @v_shuffle_v2f32_v4f32__7_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] @@ -300,6 +309,7 @@ define void @v_shuffle_v2f32_v4f32__7_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] @@ -707,9 +717,10 @@ define void @v_shuffle_v2f32_v4f32__u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -719,9 +730,10 @@ define void @v_shuffle_v2f32_v4f32__u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -731,9 +743,10 @@ define void @v_shuffle_v2f32_v4f32__u_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -920,9 +933,10 @@ define void @v_shuffle_v2f32_v4f32__4_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -932,9 +946,10 @@ define void @v_shuffle_v2f32_v4f32__4_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -944,9 +959,10 @@ define void @v_shuffle_v2f32_v4f32__4_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -1428,6 +1444,7 @@ define void @v_shuffle_v2f32_v4f32__u_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] @@ -1440,6 +1457,7 @@ define void @v_shuffle_v2f32_v4f32__u_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: v_mov_b32_e32 v1, v2 ; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] @@ -1638,6 +1656,7 @@ define void @v_shuffle_v2f32_v4f32__4_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] @@ -1650,6 +1669,7 @@ define void @v_shuffle_v2f32_v4f32__4_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: v_mov_b32_e32 v1, v2 ; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] @@ -2184,9 +2204,10 @@ define void @v_shuffle_v2f32_v4f32__1_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2196,9 +2217,10 @@ define void @v_shuffle_v2f32_v4f32__1_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -2253,6 +2275,7 @@ define void @v_shuffle_v2f32_v4f32__3_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v3 ; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] @@ -2265,6 +2288,7 @@ define void @v_shuffle_v2f32_v4f32__3_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] @@ -2277,6 +2301,7 @@ define void @v_shuffle_v2f32_v4f32__3_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] @@ -2781,6 +2806,7 @@ define void @v_shuffle_v2f32_v4f32__u_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] @@ -2793,6 +2819,7 @@ define void @v_shuffle_v2f32_v4f32__u_6(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: v_mov_b32_e32 v1, v2 ; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] @@ -3582,6 +3609,7 @@ define void @s_shuffle_v2f32_v4f32__0_u() { define void @s_shuffle_v2f32_v4f32__1_u() { ; GFX900-LABEL: s_shuffle_v2f32_v4f32__1_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -3594,6 +3622,7 @@ define void @s_shuffle_v2f32_v4f32__1_u() { ; ; GFX90A-LABEL: s_shuffle_v2f32_v4f32__1_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -3606,6 +3635,7 @@ define void @s_shuffle_v2f32_v4f32__1_u() { ; ; GFX942-LABEL: s_shuffle_v2f32_v4f32__1_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -3666,6 +3696,7 @@ define void @s_shuffle_v2f32_v4f32__2_u() { define void @s_shuffle_v2f32_v4f32__3_u() { ; GFX900-LABEL: s_shuffle_v2f32_v4f32__3_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -3678,6 +3709,7 @@ define void @s_shuffle_v2f32_v4f32__3_u() { ; ; GFX90A-LABEL: s_shuffle_v2f32_v4f32__3_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -3690,6 +3722,7 @@ define void @s_shuffle_v2f32_v4f32__3_u() { ; ; GFX942-LABEL: s_shuffle_v2f32_v4f32__3_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -3722,6 +3755,7 @@ define void @s_shuffle_v2f32_v4f32__4_u() { define void @s_shuffle_v2f32_v4f32__5_u() { ; GFX900-LABEL: s_shuffle_v2f32_v4f32__5_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -3734,6 +3768,7 @@ define void @s_shuffle_v2f32_v4f32__5_u() { ; ; GFX90A-LABEL: s_shuffle_v2f32_v4f32__5_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -3746,6 +3781,7 @@ define void @s_shuffle_v2f32_v4f32__5_u() { ; ; GFX942-LABEL: s_shuffle_v2f32_v4f32__5_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -3808,6 +3844,7 @@ define void @s_shuffle_v2f32_v4f32__6_u() { define void @s_shuffle_v2f32_v4f32__7_u() { ; GFX900-LABEL: s_shuffle_v2f32_v4f32__7_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -3820,6 +3857,7 @@ define void @s_shuffle_v2f32_v4f32__7_u() { ; ; GFX90A-LABEL: s_shuffle_v2f32_v4f32__7_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -3832,6 +3870,7 @@ define void @s_shuffle_v2f32_v4f32__7_u() { ; ; GFX942-LABEL: s_shuffle_v2f32_v4f32__7_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -4225,6 +4264,7 @@ define void @s_shuffle_v2f32_v4f32__7_7() { define void @s_shuffle_v2f32_v4f32__u_0() { ; GFX900-LABEL: s_shuffle_v2f32_v4f32__u_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -4237,6 +4277,7 @@ define void @s_shuffle_v2f32_v4f32__u_0() { ; ; GFX90A-LABEL: s_shuffle_v2f32_v4f32__u_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -4249,6 +4290,7 @@ define void @s_shuffle_v2f32_v4f32__u_0() { ; ; GFX942-LABEL: s_shuffle_v2f32_v4f32__u_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -4420,6 +4462,7 @@ define void @s_shuffle_v2f32_v4f32__3_0() { define void @s_shuffle_v2f32_v4f32__4_0() { ; GFX900-LABEL: s_shuffle_v2f32_v4f32__4_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -4432,6 +4475,7 @@ define void @s_shuffle_v2f32_v4f32__4_0() { ; ; GFX90A-LABEL: s_shuffle_v2f32_v4f32__4_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -4444,6 +4488,7 @@ define void @s_shuffle_v2f32_v4f32__4_0() { ; ; GFX942-LABEL: s_shuffle_v2f32_v4f32__4_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -4880,6 +4925,7 @@ define void @s_shuffle_v2f32_v4f32__6_1() { define void @s_shuffle_v2f32_v4f32__u_2() { ; GFX900-LABEL: s_shuffle_v2f32_v4f32__u_2: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -4892,6 +4938,7 @@ define void @s_shuffle_v2f32_v4f32__u_2() { ; ; GFX90A-LABEL: s_shuffle_v2f32_v4f32__u_2: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -4904,6 +4951,7 @@ define void @s_shuffle_v2f32_v4f32__u_2() { ; ; GFX942-LABEL: s_shuffle_v2f32_v4f32__u_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -5075,6 +5123,7 @@ define void @s_shuffle_v2f32_v4f32__3_2() { define void @s_shuffle_v2f32_v4f32__4_2() { ; GFX900-LABEL: s_shuffle_v2f32_v4f32__4_2: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -5087,6 +5136,7 @@ define void @s_shuffle_v2f32_v4f32__4_2() { ; ; GFX90A-LABEL: s_shuffle_v2f32_v4f32__4_2: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -5099,6 +5149,7 @@ define void @s_shuffle_v2f32_v4f32__4_2() { ; ; GFX942-LABEL: s_shuffle_v2f32_v4f32__4_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -5625,6 +5676,7 @@ define void @s_shuffle_v2f32_v4f32__0_4() { define void @s_shuffle_v2f32_v4f32__1_4() { ; GFX900-LABEL: s_shuffle_v2f32_v4f32__1_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -5637,6 +5689,7 @@ define void @s_shuffle_v2f32_v4f32__1_4() { ; ; GFX90A-LABEL: s_shuffle_v2f32_v4f32__1_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -5649,6 +5702,7 @@ define void @s_shuffle_v2f32_v4f32__1_4() { ; ; GFX942-LABEL: s_shuffle_v2f32_v4f32__1_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -5709,6 +5763,7 @@ define void @s_shuffle_v2f32_v4f32__2_4() { define void @s_shuffle_v2f32_v4f32__3_4() { ; GFX900-LABEL: s_shuffle_v2f32_v4f32__3_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -5721,6 +5776,7 @@ define void @s_shuffle_v2f32_v4f32__3_4() { ; ; GFX90A-LABEL: s_shuffle_v2f32_v4f32__3_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -5733,6 +5789,7 @@ define void @s_shuffle_v2f32_v4f32__3_4() { ; ; GFX942-LABEL: s_shuffle_v2f32_v4f32__3_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -6215,6 +6272,7 @@ define void @s_shuffle_v2f32_v4f32__6_5() { define void @s_shuffle_v2f32_v4f32__u_6() { ; GFX900-LABEL: s_shuffle_v2f32_v4f32__u_6: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -6227,6 +6285,7 @@ define void @s_shuffle_v2f32_v4f32__u_6() { ; ; GFX90A-LABEL: s_shuffle_v2f32_v4f32__u_6: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -6239,6 +6298,7 @@ define void @s_shuffle_v2f32_v4f32__u_6() { ; ; GFX942-LABEL: s_shuffle_v2f32_v4f32__u_6: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v8f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v8f32.ll index 3deb23ca5314b..f8492370a55bf 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v8f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v8f32.ll @@ -72,9 +72,10 @@ define void @v_shuffle_v2f32_v8f32__1_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -84,9 +85,10 @@ define void @v_shuffle_v2f32_v8f32__1_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x float> asm "; def $0", "=v"() @@ -152,6 +154,7 @@ define void @v_shuffle_v2f32_v8f32__3_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] @@ -164,6 +167,7 @@ define void @v_shuffle_v2f32_v8f32__3_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] @@ -232,6 +236,7 @@ define void @v_shuffle_v2f32_v8f32__5_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v5 ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] @@ -244,6 +249,7 @@ define void @v_shuffle_v2f32_v8f32__5_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v5 ; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] @@ -301,6 +307,7 @@ define void @v_shuffle_v2f32_v8f32__7_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v7 ; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] @@ -313,6 +320,7 @@ define void @v_shuffle_v2f32_v8f32__7_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v7 ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] @@ -325,6 +333,7 @@ define void @v_shuffle_v2f32_v8f32__7_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v7 ; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] @@ -365,9 +374,10 @@ define void @v_shuffle_v2f32_v8f32__9_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -377,9 +387,10 @@ define void @v_shuffle_v2f32_v8f32__9_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x float> asm "; def $0", "=v"() @@ -447,6 +458,7 @@ define void @v_shuffle_v2f32_v8f32__11_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] @@ -459,6 +471,7 @@ define void @v_shuffle_v2f32_v8f32__11_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] @@ -529,6 +542,7 @@ define void @v_shuffle_v2f32_v8f32__13_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v5 ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] @@ -541,6 +555,7 @@ define void @v_shuffle_v2f32_v8f32__13_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v5 ; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] @@ -600,6 +615,7 @@ define void @v_shuffle_v2f32_v8f32__15_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v7 ; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] @@ -612,6 +628,7 @@ define void @v_shuffle_v2f32_v8f32__15_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v7 ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] @@ -624,6 +641,7 @@ define void @v_shuffle_v2f32_v8f32__15_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v7 ; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] @@ -1419,9 +1437,10 @@ define void @v_shuffle_v2f32_v8f32__u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1431,9 +1450,10 @@ define void @v_shuffle_v2f32_v8f32__u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1443,9 +1463,10 @@ define void @v_shuffle_v2f32_v8f32__u_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x float> asm "; def $0", "=v"() @@ -1804,9 +1825,10 @@ define void @v_shuffle_v2f32_v8f32__8_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1816,9 +1838,10 @@ define void @v_shuffle_v2f32_v8f32__8_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1828,9 +1851,10 @@ define void @v_shuffle_v2f32_v8f32__8_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x float> asm "; def $0", "=v"() @@ -2904,6 +2928,7 @@ define void @v_shuffle_v2f32_v8f32__u_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] @@ -2916,6 +2941,7 @@ define void @v_shuffle_v2f32_v8f32__u_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v1, v2 ; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] @@ -3286,6 +3312,7 @@ define void @v_shuffle_v2f32_v8f32__8_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] @@ -3298,6 +3325,7 @@ define void @v_shuffle_v2f32_v8f32__8_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v1, v2 ; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] @@ -4374,6 +4402,7 @@ define void @v_shuffle_v2f32_v8f32__u_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] @@ -4386,6 +4415,7 @@ define void @v_shuffle_v2f32_v8f32__u_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v1, v4 ; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] @@ -4756,6 +4786,7 @@ define void @v_shuffle_v2f32_v8f32__8_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] @@ -4768,6 +4799,7 @@ define void @v_shuffle_v2f32_v8f32__8_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v1, v4 ; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] @@ -5844,6 +5876,7 @@ define void @v_shuffle_v2f32_v8f32__u_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] @@ -5856,6 +5889,7 @@ define void @v_shuffle_v2f32_v8f32__u_6(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v1, v6 ; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] @@ -6226,6 +6260,7 @@ define void @v_shuffle_v2f32_v8f32__8_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] @@ -6238,6 +6273,7 @@ define void @v_shuffle_v2f32_v8f32__8_6(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v1, v6 ; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] @@ -7364,9 +7400,10 @@ define void @v_shuffle_v2f32_v8f32__1_8(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7376,9 +7413,10 @@ define void @v_shuffle_v2f32_v8f32__1_8(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x float> asm "; def $0", "=v"() @@ -7444,6 +7482,7 @@ define void @v_shuffle_v2f32_v8f32__3_8(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] @@ -7456,6 +7495,7 @@ define void @v_shuffle_v2f32_v8f32__3_8(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] @@ -7524,6 +7564,7 @@ define void @v_shuffle_v2f32_v8f32__5_8(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v5 ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] @@ -7536,6 +7577,7 @@ define void @v_shuffle_v2f32_v8f32__5_8(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v5 ; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] @@ -7593,6 +7635,7 @@ define void @v_shuffle_v2f32_v8f32__7_8(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v7 ; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] @@ -7605,6 +7648,7 @@ define void @v_shuffle_v2f32_v8f32__7_8(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v7 ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] @@ -7617,6 +7661,7 @@ define void @v_shuffle_v2f32_v8f32__7_8(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v7 ; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] @@ -8681,6 +8726,7 @@ define void @v_shuffle_v2f32_v8f32__u_10(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] @@ -8693,6 +8739,7 @@ define void @v_shuffle_v2f32_v8f32__u_10(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v1, v2 ; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] @@ -10213,6 +10260,7 @@ define void @v_shuffle_v2f32_v8f32__u_12(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] @@ -10225,6 +10273,7 @@ define void @v_shuffle_v2f32_v8f32__u_12(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v1, v4 ; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] @@ -11745,6 +11794,7 @@ define void @v_shuffle_v2f32_v8f32__u_14(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] @@ -11757,6 +11807,7 @@ define void @v_shuffle_v2f32_v8f32__u_14(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v1, v6 ; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] @@ -13322,6 +13373,7 @@ define void @s_shuffle_v2f32_v8f32__1_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_mov_b32 s8, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] @@ -13334,6 +13386,7 @@ define void @s_shuffle_v2f32_v8f32__1_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_mov_b32 s8, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] @@ -13342,6 +13395,7 @@ define void @s_shuffle_v2f32_v8f32__1_u() { ; ; GFX942-LABEL: s_shuffle_v2f32_v8f32__1_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -13406,6 +13460,7 @@ define void @s_shuffle_v2f32_v8f32__3_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_mov_b32 s8, s7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] @@ -13418,6 +13473,7 @@ define void @s_shuffle_v2f32_v8f32__3_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_mov_b32 s8, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] @@ -13426,6 +13482,7 @@ define void @s_shuffle_v2f32_v8f32__3_u() { ; ; GFX942-LABEL: s_shuffle_v2f32_v8f32__3_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -13486,9 +13543,10 @@ define void @s_shuffle_v2f32_v8f32__5_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s9 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 +; GFX900-NEXT: s_mov_b32 s8, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -13498,9 +13556,10 @@ define void @s_shuffle_v2f32_v8f32__5_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s9 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 +; GFX90A-NEXT: s_mov_b32 s8, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -13508,6 +13567,7 @@ define void @s_shuffle_v2f32_v8f32__5_u() { ; ; GFX942-LABEL: s_shuffle_v2f32_v8f32__5_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -13572,6 +13632,7 @@ define void @s_shuffle_v2f32_v8f32__7_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_mov_b32 s8, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] @@ -13584,6 +13645,7 @@ define void @s_shuffle_v2f32_v8f32__7_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_mov_b32 s8, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] @@ -13592,6 +13654,7 @@ define void @s_shuffle_v2f32_v8f32__7_u() { ; ; GFX942-LABEL: s_shuffle_v2f32_v8f32__7_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -13628,6 +13691,7 @@ define void @s_shuffle_v2f32_v8f32__9_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_mov_b32 s8, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] @@ -13640,6 +13704,7 @@ define void @s_shuffle_v2f32_v8f32__9_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_mov_b32 s8, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] @@ -13648,6 +13713,7 @@ define void @s_shuffle_v2f32_v8f32__9_u() { ; ; GFX942-LABEL: s_shuffle_v2f32_v8f32__9_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -13714,6 +13780,7 @@ define void @s_shuffle_v2f32_v8f32__11_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_mov_b32 s8, s7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] @@ -13726,6 +13793,7 @@ define void @s_shuffle_v2f32_v8f32__11_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_mov_b32 s8, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] @@ -13734,6 +13802,7 @@ define void @s_shuffle_v2f32_v8f32__11_u() { ; ; GFX942-LABEL: s_shuffle_v2f32_v8f32__11_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -13796,9 +13865,10 @@ define void @s_shuffle_v2f32_v8f32__13_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s9 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 +; GFX900-NEXT: s_mov_b32 s8, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -13808,9 +13878,10 @@ define void @s_shuffle_v2f32_v8f32__13_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s9 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 +; GFX90A-NEXT: s_mov_b32 s8, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -13818,6 +13889,7 @@ define void @s_shuffle_v2f32_v8f32__13_u() { ; ; GFX942-LABEL: s_shuffle_v2f32_v8f32__13_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -13884,6 +13956,7 @@ define void @s_shuffle_v2f32_v8f32__15_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_mov_b32 s8, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] @@ -13896,6 +13969,7 @@ define void @s_shuffle_v2f32_v8f32__15_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_mov_b32 s8, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] @@ -13904,6 +13978,7 @@ define void @s_shuffle_v2f32_v8f32__15_u() { ; ; GFX942-LABEL: s_shuffle_v2f32_v8f32__15_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -14679,6 +14754,7 @@ define void @s_shuffle_v2f32_v8f32__u_0() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_mov_b32 s9, s4 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] @@ -14691,6 +14767,7 @@ define void @s_shuffle_v2f32_v8f32__u_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_mov_b32 s9, s4 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] @@ -14699,6 +14776,7 @@ define void @s_shuffle_v2f32_v8f32__u_0() { ; ; GFX942-LABEL: s_shuffle_v2f32_v8f32__u_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -15027,6 +15105,7 @@ define void @s_shuffle_v2f32_v8f32__8_0() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_mov_b32 s9, s4 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] @@ -15039,6 +15118,7 @@ define void @s_shuffle_v2f32_v8f32__8_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_mov_b32 s9, s4 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] @@ -15047,6 +15127,7 @@ define void @s_shuffle_v2f32_v8f32__8_0() { ; ; GFX942-LABEL: s_shuffle_v2f32_v8f32__8_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -16023,6 +16104,7 @@ define void @s_shuffle_v2f32_v8f32__u_2() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_mov_b32 s9, s6 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] @@ -16035,6 +16117,7 @@ define void @s_shuffle_v2f32_v8f32__u_2() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_mov_b32 s9, s6 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] @@ -16043,6 +16126,7 @@ define void @s_shuffle_v2f32_v8f32__u_2() { ; ; GFX942-LABEL: s_shuffle_v2f32_v8f32__u_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -16371,6 +16455,7 @@ define void @s_shuffle_v2f32_v8f32__8_2() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_mov_b32 s9, s6 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] @@ -16383,6 +16468,7 @@ define void @s_shuffle_v2f32_v8f32__8_2() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_mov_b32 s9, s6 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] @@ -16391,6 +16477,7 @@ define void @s_shuffle_v2f32_v8f32__8_2() { ; ; GFX942-LABEL: s_shuffle_v2f32_v8f32__8_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -17464,9 +17551,10 @@ define void @s_shuffle_v2f32_v8f32__u_4() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s9, s8 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 +; GFX900-NEXT: s_mov_b32 s9, s12 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -17476,9 +17564,10 @@ define void @s_shuffle_v2f32_v8f32__u_4() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s9, s8 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 +; GFX90A-NEXT: s_mov_b32 s9, s12 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -17486,6 +17575,7 @@ define void @s_shuffle_v2f32_v8f32__u_4() { ; ; GFX942-LABEL: s_shuffle_v2f32_v8f32__u_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -17812,9 +17902,10 @@ define void @s_shuffle_v2f32_v8f32__8_4() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s9, s8 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 +; GFX900-NEXT: s_mov_b32 s9, s12 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -17824,9 +17915,10 @@ define void @s_shuffle_v2f32_v8f32__8_4() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s9, s8 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 +; GFX90A-NEXT: s_mov_b32 s9, s12 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -17834,6 +17926,7 @@ define void @s_shuffle_v2f32_v8f32__8_4() { ; ; GFX942-LABEL: s_shuffle_v2f32_v8f32__8_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -18806,6 +18899,7 @@ define void @s_shuffle_v2f32_v8f32__u_6() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_mov_b32 s9, s10 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] @@ -18818,6 +18912,7 @@ define void @s_shuffle_v2f32_v8f32__u_6() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_mov_b32 s9, s10 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] @@ -18826,6 +18921,7 @@ define void @s_shuffle_v2f32_v8f32__u_6() { ; ; GFX942-LABEL: s_shuffle_v2f32_v8f32__u_6: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -19154,6 +19250,7 @@ define void @s_shuffle_v2f32_v8f32__8_6() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_mov_b32 s9, s10 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] @@ -19166,6 +19263,7 @@ define void @s_shuffle_v2f32_v8f32__8_6() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_mov_b32 s9, s10 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] @@ -19174,6 +19272,7 @@ define void @s_shuffle_v2f32_v8f32__8_6() { ; ; GFX942-LABEL: s_shuffle_v2f32_v8f32__8_6: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -20293,6 +20392,7 @@ define void @s_shuffle_v2f32_v8f32__1_8() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_mov_b32 s8, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] @@ -20305,6 +20405,7 @@ define void @s_shuffle_v2f32_v8f32__1_8() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_mov_b32 s8, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] @@ -20313,6 +20414,7 @@ define void @s_shuffle_v2f32_v8f32__1_8() { ; ; GFX942-LABEL: s_shuffle_v2f32_v8f32__1_8: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -20377,6 +20479,7 @@ define void @s_shuffle_v2f32_v8f32__3_8() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_mov_b32 s8, s7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] @@ -20389,6 +20492,7 @@ define void @s_shuffle_v2f32_v8f32__3_8() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_mov_b32 s8, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] @@ -20397,6 +20501,7 @@ define void @s_shuffle_v2f32_v8f32__3_8() { ; ; GFX942-LABEL: s_shuffle_v2f32_v8f32__3_8: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -20457,9 +20562,10 @@ define void @s_shuffle_v2f32_v8f32__5_8() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s9 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 +; GFX900-NEXT: s_mov_b32 s8, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -20469,9 +20575,10 @@ define void @s_shuffle_v2f32_v8f32__5_8() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s9 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 +; GFX90A-NEXT: s_mov_b32 s8, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -20479,6 +20586,7 @@ define void @s_shuffle_v2f32_v8f32__5_8() { ; ; GFX942-LABEL: s_shuffle_v2f32_v8f32__5_8: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -20543,6 +20651,7 @@ define void @s_shuffle_v2f32_v8f32__7_8() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_mov_b32 s8, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] @@ -20555,6 +20664,7 @@ define void @s_shuffle_v2f32_v8f32__7_8() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_mov_b32 s8, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] @@ -20563,6 +20673,7 @@ define void @s_shuffle_v2f32_v8f32__7_8() { ; ; GFX942-LABEL: s_shuffle_v2f32_v8f32__7_8: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -21522,6 +21633,7 @@ define void @s_shuffle_v2f32_v8f32__u_10() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_mov_b32 s9, s6 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] @@ -21534,6 +21646,7 @@ define void @s_shuffle_v2f32_v8f32__u_10() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_mov_b32 s9, s6 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] @@ -21542,6 +21655,7 @@ define void @s_shuffle_v2f32_v8f32__u_10() { ; ; GFX942-LABEL: s_shuffle_v2f32_v8f32__u_10: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -23015,9 +23129,10 @@ define void @s_shuffle_v2f32_v8f32__u_12() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s9, s8 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 +; GFX900-NEXT: s_mov_b32 s9, s12 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -23027,9 +23142,10 @@ define void @s_shuffle_v2f32_v8f32__u_12() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s9, s8 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 +; GFX90A-NEXT: s_mov_b32 s9, s12 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -23037,6 +23153,7 @@ define void @s_shuffle_v2f32_v8f32__u_12() { ; ; GFX942-LABEL: s_shuffle_v2f32_v8f32__u_12: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -24442,6 +24559,7 @@ define void @s_shuffle_v2f32_v8f32__u_14() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_mov_b32 s9, s10 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] @@ -24454,6 +24572,7 @@ define void @s_shuffle_v2f32_v8f32__u_14() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_mov_b32 s9, s10 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] @@ -24462,6 +24581,7 @@ define void @s_shuffle_v2f32_v8f32__u_14() { ; ; GFX942-LABEL: s_shuffle_v2f32_v8f32__u_14: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v2i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v2i32.ll index 676a521757bd8..4b4f36f60b30a 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v2i32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v2i32.ll @@ -57,37 +57,41 @@ define void @v_shuffle_v2i32_v2i32__0_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v2i32_v2i32__1_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v2i32__1_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i32_v2i32__1_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i32_v2i32__1_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -110,37 +114,41 @@ define void @v_shuffle_v2i32_v2i32__2_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v2i32_v2i32__3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v2i32__3_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i32_v2i32__3_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i32_v2i32__3_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -350,33 +358,37 @@ define void @v_shuffle_v2i32_v2i32__u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i32_v2i32__u_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i32_v2i32__u_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -476,33 +488,37 @@ define void @v_shuffle_v2i32_v2i32__2_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i32_v2i32__2_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i32_v2i32__2_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -723,37 +739,41 @@ define void @v_shuffle_v2i32_v2i32__0_2(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v2i32_v2i32__1_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i32_v2i32__1_2: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i32_v2i32__1_2: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i32_v2i32__1_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -1016,6 +1036,7 @@ define void @s_shuffle_v2i32_v2i32__0_u() { define void @s_shuffle_v2i32_v2i32__1_u() { ; GFX900-LABEL: s_shuffle_v2i32_v2i32__1_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -1028,6 +1049,7 @@ define void @s_shuffle_v2i32_v2i32__1_u() { ; ; GFX90A-LABEL: s_shuffle_v2i32_v2i32__1_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -1040,6 +1062,7 @@ define void @s_shuffle_v2i32_v2i32__1_u() { ; ; GFX942-LABEL: s_shuffle_v2i32_v2i32__1_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -1072,6 +1095,7 @@ define void @s_shuffle_v2i32_v2i32__2_u() { define void @s_shuffle_v2i32_v2i32__3_u() { ; GFX900-LABEL: s_shuffle_v2i32_v2i32__3_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -1084,6 +1108,7 @@ define void @s_shuffle_v2i32_v2i32__3_u() { ; ; GFX90A-LABEL: s_shuffle_v2i32_v2i32__3_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -1096,6 +1121,7 @@ define void @s_shuffle_v2i32_v2i32__3_u() { ; ; GFX942-LABEL: s_shuffle_v2i32_v2i32__3_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -1287,6 +1313,7 @@ define void @s_shuffle_v2i32_v2i32__3_3() { define void @s_shuffle_v2i32_v2i32__u_0() { ; GFX900-LABEL: s_shuffle_v2i32_v2i32__u_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -1299,6 +1326,7 @@ define void @s_shuffle_v2i32_v2i32__u_0() { ; ; GFX90A-LABEL: s_shuffle_v2i32_v2i32__u_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -1311,6 +1339,7 @@ define void @s_shuffle_v2i32_v2i32__u_0() { ; ; GFX942-LABEL: s_shuffle_v2i32_v2i32__u_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -1392,6 +1421,7 @@ define void @s_shuffle_v2i32_v2i32__1_0() { define void @s_shuffle_v2i32_v2i32__2_0() { ; GFX900-LABEL: s_shuffle_v2i32_v2i32__2_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -1404,6 +1434,7 @@ define void @s_shuffle_v2i32_v2i32__2_0() { ; ; GFX90A-LABEL: s_shuffle_v2i32_v2i32__2_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -1416,6 +1447,7 @@ define void @s_shuffle_v2i32_v2i32__2_0() { ; ; GFX942-LABEL: s_shuffle_v2i32_v2i32__2_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -1626,6 +1658,7 @@ define void @s_shuffle_v2i32_v2i32__0_2() { define void @s_shuffle_v2i32_v2i32__1_2() { ; GFX900-LABEL: s_shuffle_v2i32_v2i32__1_2: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -1638,6 +1671,7 @@ define void @s_shuffle_v2i32_v2i32__1_2() { ; ; GFX90A-LABEL: s_shuffle_v2i32_v2i32__1_2: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -1650,6 +1684,7 @@ define void @s_shuffle_v2i32_v2i32__1_2() { ; ; GFX942-LABEL: s_shuffle_v2i32_v2i32__1_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v3i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v3i32.ll index f65340470feb1..e6e1b6b67bcb6 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v3i32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v3i32.ll @@ -72,9 +72,10 @@ define void @v_shuffle_v2i32_v3i32__1_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -84,9 +85,10 @@ define void @v_shuffle_v2i32_v3i32__1_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -102,6 +104,7 @@ define void @v_shuffle_v2i32_v3i32__2_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v2 ; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] @@ -114,6 +117,7 @@ define void @v_shuffle_v2i32_v3i32__2_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] @@ -126,6 +130,7 @@ define void @v_shuffle_v2i32_v3i32__2_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] @@ -166,9 +171,10 @@ define void @v_shuffle_v2i32_v3i32__4_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -178,9 +184,10 @@ define void @v_shuffle_v2i32_v3i32__4_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -197,6 +204,7 @@ define void @v_shuffle_v2i32_v3i32__5_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v2 ; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] @@ -209,6 +217,7 @@ define void @v_shuffle_v2i32_v3i32__5_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] @@ -221,6 +230,7 @@ define void @v_shuffle_v2i32_v3i32__5_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] @@ -536,9 +546,10 @@ define void @v_shuffle_v2i32_v3i32__u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -548,9 +559,10 @@ define void @v_shuffle_v2i32_v3i32__u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -560,9 +572,10 @@ define void @v_shuffle_v2i32_v3i32__u_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -706,9 +719,10 @@ define void @v_shuffle_v2i32_v3i32__3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -718,9 +732,10 @@ define void @v_shuffle_v2i32_v3i32__3_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -730,9 +745,10 @@ define void @v_shuffle_v2i32_v3i32__3_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1066,6 +1082,7 @@ define void @v_shuffle_v2i32_v3i32__u_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] @@ -1078,6 +1095,7 @@ define void @v_shuffle_v2i32_v3i32__u_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: v_mov_b32_e32 v1, v2 ; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] @@ -1236,6 +1254,7 @@ define void @v_shuffle_v2i32_v3i32__3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] @@ -1248,6 +1267,7 @@ define void @v_shuffle_v2i32_v3i32__3_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: v_mov_b32_e32 v1, v2 ; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] @@ -1381,9 +1401,10 @@ define void @v_shuffle_v2i32_v3i32__1_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1393,9 +1414,10 @@ define void @v_shuffle_v2i32_v3i32__1_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1411,6 +1433,7 @@ define void @v_shuffle_v2i32_v3i32__2_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v2 ; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] @@ -1423,6 +1446,7 @@ define void @v_shuffle_v2i32_v3i32__2_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] @@ -1435,6 +1459,7 @@ define void @v_shuffle_v2i32_v3i32__2_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] @@ -1800,6 +1825,7 @@ define void @v_shuffle_v2i32_v3i32__u_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] @@ -1812,6 +1838,7 @@ define void @v_shuffle_v2i32_v3i32__u_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: v_mov_b32_e32 v1, v2 ; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] @@ -2129,6 +2156,7 @@ define void @s_shuffle_v2i32_v3i32__0_u() { define void @s_shuffle_v2i32_v3i32__1_u() { ; GFX900-LABEL: s_shuffle_v2i32_v3i32__1_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -2141,6 +2169,7 @@ define void @s_shuffle_v2i32_v3i32__1_u() { ; ; GFX90A-LABEL: s_shuffle_v2i32_v3i32__1_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -2153,6 +2182,7 @@ define void @s_shuffle_v2i32_v3i32__1_u() { ; ; GFX942-LABEL: s_shuffle_v2i32_v3i32__1_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -2171,6 +2201,7 @@ define void @s_shuffle_v2i32_v3i32__1_u() { define void @s_shuffle_v2i32_v3i32__2_u() { ; GFX900-LABEL: s_shuffle_v2i32_v3i32__2_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -2183,6 +2214,7 @@ define void @s_shuffle_v2i32_v3i32__2_u() { ; ; GFX90A-LABEL: s_shuffle_v2i32_v3i32__2_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -2195,6 +2227,7 @@ define void @s_shuffle_v2i32_v3i32__2_u() { ; ; GFX942-LABEL: s_shuffle_v2i32_v3i32__2_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -2227,6 +2260,7 @@ define void @s_shuffle_v2i32_v3i32__3_u() { define void @s_shuffle_v2i32_v3i32__4_u() { ; GFX900-LABEL: s_shuffle_v2i32_v3i32__4_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -2239,6 +2273,7 @@ define void @s_shuffle_v2i32_v3i32__4_u() { ; ; GFX90A-LABEL: s_shuffle_v2i32_v3i32__4_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -2251,6 +2286,7 @@ define void @s_shuffle_v2i32_v3i32__4_u() { ; ; GFX942-LABEL: s_shuffle_v2i32_v3i32__4_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -2270,6 +2306,7 @@ define void @s_shuffle_v2i32_v3i32__4_u() { define void @s_shuffle_v2i32_v3i32__5_u() { ; GFX900-LABEL: s_shuffle_v2i32_v3i32__5_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -2282,6 +2319,7 @@ define void @s_shuffle_v2i32_v3i32__5_u() { ; ; GFX90A-LABEL: s_shuffle_v2i32_v3i32__5_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -2294,6 +2332,7 @@ define void @s_shuffle_v2i32_v3i32__5_u() { ; ; GFX942-LABEL: s_shuffle_v2i32_v3i32__5_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -2586,6 +2625,7 @@ define void @s_shuffle_v2i32_v3i32__5_5() { define void @s_shuffle_v2i32_v3i32__u_0() { ; GFX900-LABEL: s_shuffle_v2i32_v3i32__u_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -2598,6 +2638,7 @@ define void @s_shuffle_v2i32_v3i32__u_0() { ; ; GFX90A-LABEL: s_shuffle_v2i32_v3i32__u_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -2610,6 +2651,7 @@ define void @s_shuffle_v2i32_v3i32__u_0() { ; ; GFX942-LABEL: s_shuffle_v2i32_v3i32__u_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -2736,6 +2778,7 @@ define void @s_shuffle_v2i32_v3i32__2_0() { define void @s_shuffle_v2i32_v3i32__3_0() { ; GFX900-LABEL: s_shuffle_v2i32_v3i32__3_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -2748,6 +2791,7 @@ define void @s_shuffle_v2i32_v3i32__3_0() { ; ; GFX90A-LABEL: s_shuffle_v2i32_v3i32__3_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -2760,6 +2804,7 @@ define void @s_shuffle_v2i32_v3i32__3_0() { ; ; GFX942-LABEL: s_shuffle_v2i32_v3i32__3_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -3041,6 +3086,7 @@ define void @s_shuffle_v2i32_v3i32__4_1() { define void @s_shuffle_v2i32_v3i32__u_2() { ; GFX900-LABEL: s_shuffle_v2i32_v3i32__u_2: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -3053,6 +3099,7 @@ define void @s_shuffle_v2i32_v3i32__u_2() { ; ; GFX90A-LABEL: s_shuffle_v2i32_v3i32__u_2: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -3065,6 +3112,7 @@ define void @s_shuffle_v2i32_v3i32__u_2() { ; ; GFX942-LABEL: s_shuffle_v2i32_v3i32__u_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -3191,6 +3239,7 @@ define void @s_shuffle_v2i32_v3i32__2_2() { define void @s_shuffle_v2i32_v3i32__3_2() { ; GFX900-LABEL: s_shuffle_v2i32_v3i32__3_2: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -3203,6 +3252,7 @@ define void @s_shuffle_v2i32_v3i32__3_2() { ; ; GFX90A-LABEL: s_shuffle_v2i32_v3i32__3_2: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -3215,6 +3265,7 @@ define void @s_shuffle_v2i32_v3i32__3_2() { ; ; GFX942-LABEL: s_shuffle_v2i32_v3i32__3_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -3342,6 +3393,7 @@ define void @s_shuffle_v2i32_v3i32__0_3() { define void @s_shuffle_v2i32_v3i32__1_3() { ; GFX900-LABEL: s_shuffle_v2i32_v3i32__1_3: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -3354,6 +3406,7 @@ define void @s_shuffle_v2i32_v3i32__1_3() { ; ; GFX90A-LABEL: s_shuffle_v2i32_v3i32__1_3: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -3366,6 +3419,7 @@ define void @s_shuffle_v2i32_v3i32__1_3() { ; ; GFX942-LABEL: s_shuffle_v2i32_v3i32__1_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -3384,6 +3438,7 @@ define void @s_shuffle_v2i32_v3i32__1_3() { define void @s_shuffle_v2i32_v3i32__2_3() { ; GFX900-LABEL: s_shuffle_v2i32_v3i32__2_3: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -3396,6 +3451,7 @@ define void @s_shuffle_v2i32_v3i32__2_3() { ; ; GFX90A-LABEL: s_shuffle_v2i32_v3i32__2_3: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -3408,6 +3464,7 @@ define void @s_shuffle_v2i32_v3i32__2_3() { ; ; GFX942-LABEL: s_shuffle_v2i32_v3i32__2_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -3743,6 +3800,7 @@ define void @s_shuffle_v2i32_v3i32__4_4() { define void @s_shuffle_v2i32_v3i32__u_5() { ; GFX900-LABEL: s_shuffle_v2i32_v3i32__u_5: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -3755,6 +3813,7 @@ define void @s_shuffle_v2i32_v3i32__u_5() { ; ; GFX90A-LABEL: s_shuffle_v2i32_v3i32__u_5: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -3767,6 +3826,7 @@ define void @s_shuffle_v2i32_v3i32__u_5() { ; ; GFX942-LABEL: s_shuffle_v2i32_v3i32__u_5: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v4i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v4i32.ll index 37df1b6a72e03..42b6563a0c8e8 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v4i32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v4i32.ll @@ -72,9 +72,10 @@ define void @v_shuffle_v2i32_v4i32__1_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -84,9 +85,10 @@ define void @v_shuffle_v2i32_v4i32__1_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -141,6 +143,7 @@ define void @v_shuffle_v2i32_v4i32__3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v3 ; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] @@ -153,6 +156,7 @@ define void @v_shuffle_v2i32_v4i32__3_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] @@ -165,6 +169,7 @@ define void @v_shuffle_v2i32_v4i32__3_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] @@ -205,9 +210,10 @@ define void @v_shuffle_v2i32_v4i32__5_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -217,9 +223,10 @@ define void @v_shuffle_v2i32_v4i32__5_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -276,6 +283,7 @@ define void @v_shuffle_v2i32_v4i32__7_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v3 ; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] @@ -288,6 +296,7 @@ define void @v_shuffle_v2i32_v4i32__7_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] @@ -300,6 +309,7 @@ define void @v_shuffle_v2i32_v4i32__7_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] @@ -707,9 +717,10 @@ define void @v_shuffle_v2i32_v4i32__u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -719,9 +730,10 @@ define void @v_shuffle_v2i32_v4i32__u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -731,9 +743,10 @@ define void @v_shuffle_v2i32_v4i32__u_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -920,9 +933,10 @@ define void @v_shuffle_v2i32_v4i32__4_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -932,9 +946,10 @@ define void @v_shuffle_v2i32_v4i32__4_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -944,9 +959,10 @@ define void @v_shuffle_v2i32_v4i32__4_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -1428,6 +1444,7 @@ define void @v_shuffle_v2i32_v4i32__u_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] @@ -1440,6 +1457,7 @@ define void @v_shuffle_v2i32_v4i32__u_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: v_mov_b32_e32 v1, v2 ; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] @@ -1638,6 +1656,7 @@ define void @v_shuffle_v2i32_v4i32__4_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] @@ -1650,6 +1669,7 @@ define void @v_shuffle_v2i32_v4i32__4_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: v_mov_b32_e32 v1, v2 ; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] @@ -2184,9 +2204,10 @@ define void @v_shuffle_v2i32_v4i32__1_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2196,9 +2217,10 @@ define void @v_shuffle_v2i32_v4i32__1_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -2253,6 +2275,7 @@ define void @v_shuffle_v2i32_v4i32__3_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v3 ; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] @@ -2265,6 +2288,7 @@ define void @v_shuffle_v2i32_v4i32__3_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] @@ -2277,6 +2301,7 @@ define void @v_shuffle_v2i32_v4i32__3_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] @@ -2781,6 +2806,7 @@ define void @v_shuffle_v2i32_v4i32__u_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] @@ -2793,6 +2819,7 @@ define void @v_shuffle_v2i32_v4i32__u_6(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: v_mov_b32_e32 v1, v2 ; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] @@ -3582,6 +3609,7 @@ define void @s_shuffle_v2i32_v4i32__0_u() { define void @s_shuffle_v2i32_v4i32__1_u() { ; GFX900-LABEL: s_shuffle_v2i32_v4i32__1_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -3594,6 +3622,7 @@ define void @s_shuffle_v2i32_v4i32__1_u() { ; ; GFX90A-LABEL: s_shuffle_v2i32_v4i32__1_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -3606,6 +3635,7 @@ define void @s_shuffle_v2i32_v4i32__1_u() { ; ; GFX942-LABEL: s_shuffle_v2i32_v4i32__1_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -3666,6 +3696,7 @@ define void @s_shuffle_v2i32_v4i32__2_u() { define void @s_shuffle_v2i32_v4i32__3_u() { ; GFX900-LABEL: s_shuffle_v2i32_v4i32__3_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -3678,6 +3709,7 @@ define void @s_shuffle_v2i32_v4i32__3_u() { ; ; GFX90A-LABEL: s_shuffle_v2i32_v4i32__3_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -3690,6 +3722,7 @@ define void @s_shuffle_v2i32_v4i32__3_u() { ; ; GFX942-LABEL: s_shuffle_v2i32_v4i32__3_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -3722,6 +3755,7 @@ define void @s_shuffle_v2i32_v4i32__4_u() { define void @s_shuffle_v2i32_v4i32__5_u() { ; GFX900-LABEL: s_shuffle_v2i32_v4i32__5_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -3734,6 +3768,7 @@ define void @s_shuffle_v2i32_v4i32__5_u() { ; ; GFX90A-LABEL: s_shuffle_v2i32_v4i32__5_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -3746,6 +3781,7 @@ define void @s_shuffle_v2i32_v4i32__5_u() { ; ; GFX942-LABEL: s_shuffle_v2i32_v4i32__5_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -3808,6 +3844,7 @@ define void @s_shuffle_v2i32_v4i32__6_u() { define void @s_shuffle_v2i32_v4i32__7_u() { ; GFX900-LABEL: s_shuffle_v2i32_v4i32__7_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -3820,6 +3857,7 @@ define void @s_shuffle_v2i32_v4i32__7_u() { ; ; GFX90A-LABEL: s_shuffle_v2i32_v4i32__7_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -3832,6 +3870,7 @@ define void @s_shuffle_v2i32_v4i32__7_u() { ; ; GFX942-LABEL: s_shuffle_v2i32_v4i32__7_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -4225,6 +4264,7 @@ define void @s_shuffle_v2i32_v4i32__7_7() { define void @s_shuffle_v2i32_v4i32__u_0() { ; GFX900-LABEL: s_shuffle_v2i32_v4i32__u_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -4237,6 +4277,7 @@ define void @s_shuffle_v2i32_v4i32__u_0() { ; ; GFX90A-LABEL: s_shuffle_v2i32_v4i32__u_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -4249,6 +4290,7 @@ define void @s_shuffle_v2i32_v4i32__u_0() { ; ; GFX942-LABEL: s_shuffle_v2i32_v4i32__u_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -4420,6 +4462,7 @@ define void @s_shuffle_v2i32_v4i32__3_0() { define void @s_shuffle_v2i32_v4i32__4_0() { ; GFX900-LABEL: s_shuffle_v2i32_v4i32__4_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -4432,6 +4475,7 @@ define void @s_shuffle_v2i32_v4i32__4_0() { ; ; GFX90A-LABEL: s_shuffle_v2i32_v4i32__4_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -4444,6 +4488,7 @@ define void @s_shuffle_v2i32_v4i32__4_0() { ; ; GFX942-LABEL: s_shuffle_v2i32_v4i32__4_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -4880,6 +4925,7 @@ define void @s_shuffle_v2i32_v4i32__6_1() { define void @s_shuffle_v2i32_v4i32__u_2() { ; GFX900-LABEL: s_shuffle_v2i32_v4i32__u_2: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -4892,6 +4938,7 @@ define void @s_shuffle_v2i32_v4i32__u_2() { ; ; GFX90A-LABEL: s_shuffle_v2i32_v4i32__u_2: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -4904,6 +4951,7 @@ define void @s_shuffle_v2i32_v4i32__u_2() { ; ; GFX942-LABEL: s_shuffle_v2i32_v4i32__u_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -5075,6 +5123,7 @@ define void @s_shuffle_v2i32_v4i32__3_2() { define void @s_shuffle_v2i32_v4i32__4_2() { ; GFX900-LABEL: s_shuffle_v2i32_v4i32__4_2: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -5087,6 +5136,7 @@ define void @s_shuffle_v2i32_v4i32__4_2() { ; ; GFX90A-LABEL: s_shuffle_v2i32_v4i32__4_2: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -5099,6 +5149,7 @@ define void @s_shuffle_v2i32_v4i32__4_2() { ; ; GFX942-LABEL: s_shuffle_v2i32_v4i32__4_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -5625,6 +5676,7 @@ define void @s_shuffle_v2i32_v4i32__0_4() { define void @s_shuffle_v2i32_v4i32__1_4() { ; GFX900-LABEL: s_shuffle_v2i32_v4i32__1_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -5637,6 +5689,7 @@ define void @s_shuffle_v2i32_v4i32__1_4() { ; ; GFX90A-LABEL: s_shuffle_v2i32_v4i32__1_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -5649,6 +5702,7 @@ define void @s_shuffle_v2i32_v4i32__1_4() { ; ; GFX942-LABEL: s_shuffle_v2i32_v4i32__1_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -5709,6 +5763,7 @@ define void @s_shuffle_v2i32_v4i32__2_4() { define void @s_shuffle_v2i32_v4i32__3_4() { ; GFX900-LABEL: s_shuffle_v2i32_v4i32__3_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -5721,6 +5776,7 @@ define void @s_shuffle_v2i32_v4i32__3_4() { ; ; GFX90A-LABEL: s_shuffle_v2i32_v4i32__3_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -5733,6 +5789,7 @@ define void @s_shuffle_v2i32_v4i32__3_4() { ; ; GFX942-LABEL: s_shuffle_v2i32_v4i32__3_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -6215,6 +6272,7 @@ define void @s_shuffle_v2i32_v4i32__6_5() { define void @s_shuffle_v2i32_v4i32__u_6() { ; GFX900-LABEL: s_shuffle_v2i32_v4i32__u_6: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -6227,6 +6285,7 @@ define void @s_shuffle_v2i32_v4i32__u_6() { ; ; GFX90A-LABEL: s_shuffle_v2i32_v4i32__u_6: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -6239,6 +6298,7 @@ define void @s_shuffle_v2i32_v4i32__u_6() { ; ; GFX942-LABEL: s_shuffle_v2i32_v4i32__u_6: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v8i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v8i32.ll index 94ee1774c2766..6763eb143a4d0 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v8i32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v8i32.ll @@ -72,9 +72,10 @@ define void @v_shuffle_v2i32_v8i32__1_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -84,9 +85,10 @@ define void @v_shuffle_v2i32_v8i32__1_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i32> asm "; def $0", "=v"() @@ -152,6 +154,7 @@ define void @v_shuffle_v2i32_v8i32__3_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] @@ -164,6 +167,7 @@ define void @v_shuffle_v2i32_v8i32__3_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] @@ -232,6 +236,7 @@ define void @v_shuffle_v2i32_v8i32__5_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v5 ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] @@ -244,6 +249,7 @@ define void @v_shuffle_v2i32_v8i32__5_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v5 ; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] @@ -301,6 +307,7 @@ define void @v_shuffle_v2i32_v8i32__7_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v7 ; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] @@ -313,6 +320,7 @@ define void @v_shuffle_v2i32_v8i32__7_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v7 ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] @@ -325,6 +333,7 @@ define void @v_shuffle_v2i32_v8i32__7_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v7 ; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] @@ -365,9 +374,10 @@ define void @v_shuffle_v2i32_v8i32__9_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -377,9 +387,10 @@ define void @v_shuffle_v2i32_v8i32__9_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i32> asm "; def $0", "=v"() @@ -447,6 +458,7 @@ define void @v_shuffle_v2i32_v8i32__11_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] @@ -459,6 +471,7 @@ define void @v_shuffle_v2i32_v8i32__11_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] @@ -529,6 +542,7 @@ define void @v_shuffle_v2i32_v8i32__13_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v5 ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] @@ -541,6 +555,7 @@ define void @v_shuffle_v2i32_v8i32__13_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v5 ; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] @@ -600,6 +615,7 @@ define void @v_shuffle_v2i32_v8i32__15_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v7 ; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] @@ -612,6 +628,7 @@ define void @v_shuffle_v2i32_v8i32__15_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v7 ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] @@ -624,6 +641,7 @@ define void @v_shuffle_v2i32_v8i32__15_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v7 ; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] @@ -1419,9 +1437,10 @@ define void @v_shuffle_v2i32_v8i32__u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1431,9 +1450,10 @@ define void @v_shuffle_v2i32_v8i32__u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1443,9 +1463,10 @@ define void @v_shuffle_v2i32_v8i32__u_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i32> asm "; def $0", "=v"() @@ -1804,9 +1825,10 @@ define void @v_shuffle_v2i32_v8i32__8_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1816,9 +1838,10 @@ define void @v_shuffle_v2i32_v8i32__8_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1828,9 +1851,10 @@ define void @v_shuffle_v2i32_v8i32__8_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i32> asm "; def $0", "=v"() @@ -2904,6 +2928,7 @@ define void @v_shuffle_v2i32_v8i32__u_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] @@ -2916,6 +2941,7 @@ define void @v_shuffle_v2i32_v8i32__u_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v1, v2 ; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] @@ -3286,6 +3312,7 @@ define void @v_shuffle_v2i32_v8i32__8_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] @@ -3298,6 +3325,7 @@ define void @v_shuffle_v2i32_v8i32__8_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v1, v2 ; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] @@ -4374,6 +4402,7 @@ define void @v_shuffle_v2i32_v8i32__u_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] @@ -4386,6 +4415,7 @@ define void @v_shuffle_v2i32_v8i32__u_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v1, v4 ; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] @@ -4756,6 +4786,7 @@ define void @v_shuffle_v2i32_v8i32__8_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] @@ -4768,6 +4799,7 @@ define void @v_shuffle_v2i32_v8i32__8_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v1, v4 ; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] @@ -5844,6 +5876,7 @@ define void @v_shuffle_v2i32_v8i32__u_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] @@ -5856,6 +5889,7 @@ define void @v_shuffle_v2i32_v8i32__u_6(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v1, v6 ; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] @@ -6226,6 +6260,7 @@ define void @v_shuffle_v2i32_v8i32__8_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] @@ -6238,6 +6273,7 @@ define void @v_shuffle_v2i32_v8i32__8_6(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v1, v6 ; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] @@ -7364,9 +7400,10 @@ define void @v_shuffle_v2i32_v8i32__1_8(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7376,9 +7413,10 @@ define void @v_shuffle_v2i32_v8i32__1_8(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i32> asm "; def $0", "=v"() @@ -7444,6 +7482,7 @@ define void @v_shuffle_v2i32_v8i32__3_8(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] @@ -7456,6 +7495,7 @@ define void @v_shuffle_v2i32_v8i32__3_8(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] @@ -7524,6 +7564,7 @@ define void @v_shuffle_v2i32_v8i32__5_8(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v5 ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] @@ -7536,6 +7577,7 @@ define void @v_shuffle_v2i32_v8i32__5_8(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v5 ; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] @@ -7593,6 +7635,7 @@ define void @v_shuffle_v2i32_v8i32__7_8(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v7 ; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] @@ -7605,6 +7648,7 @@ define void @v_shuffle_v2i32_v8i32__7_8(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v7 ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] @@ -7617,6 +7661,7 @@ define void @v_shuffle_v2i32_v8i32__7_8(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v7 ; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] @@ -8681,6 +8726,7 @@ define void @v_shuffle_v2i32_v8i32__u_10(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] @@ -8693,6 +8739,7 @@ define void @v_shuffle_v2i32_v8i32__u_10(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v1, v2 ; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] @@ -10213,6 +10260,7 @@ define void @v_shuffle_v2i32_v8i32__u_12(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] @@ -10225,6 +10273,7 @@ define void @v_shuffle_v2i32_v8i32__u_12(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v1, v4 ; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] @@ -11745,6 +11794,7 @@ define void @v_shuffle_v2i32_v8i32__u_14(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] @@ -11757,6 +11807,7 @@ define void @v_shuffle_v2i32_v8i32__u_14(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v1, v6 ; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] @@ -13322,6 +13373,7 @@ define void @s_shuffle_v2i32_v8i32__1_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_mov_b32 s8, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] @@ -13334,6 +13386,7 @@ define void @s_shuffle_v2i32_v8i32__1_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_mov_b32 s8, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] @@ -13342,6 +13395,7 @@ define void @s_shuffle_v2i32_v8i32__1_u() { ; ; GFX942-LABEL: s_shuffle_v2i32_v8i32__1_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -13406,6 +13460,7 @@ define void @s_shuffle_v2i32_v8i32__3_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_mov_b32 s8, s7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] @@ -13418,6 +13473,7 @@ define void @s_shuffle_v2i32_v8i32__3_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_mov_b32 s8, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] @@ -13426,6 +13482,7 @@ define void @s_shuffle_v2i32_v8i32__3_u() { ; ; GFX942-LABEL: s_shuffle_v2i32_v8i32__3_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -13486,9 +13543,10 @@ define void @s_shuffle_v2i32_v8i32__5_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s9 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 +; GFX900-NEXT: s_mov_b32 s8, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -13498,9 +13556,10 @@ define void @s_shuffle_v2i32_v8i32__5_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s9 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 +; GFX90A-NEXT: s_mov_b32 s8, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -13508,6 +13567,7 @@ define void @s_shuffle_v2i32_v8i32__5_u() { ; ; GFX942-LABEL: s_shuffle_v2i32_v8i32__5_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -13572,6 +13632,7 @@ define void @s_shuffle_v2i32_v8i32__7_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_mov_b32 s8, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] @@ -13584,6 +13645,7 @@ define void @s_shuffle_v2i32_v8i32__7_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_mov_b32 s8, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] @@ -13592,6 +13654,7 @@ define void @s_shuffle_v2i32_v8i32__7_u() { ; ; GFX942-LABEL: s_shuffle_v2i32_v8i32__7_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -13628,6 +13691,7 @@ define void @s_shuffle_v2i32_v8i32__9_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_mov_b32 s8, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] @@ -13640,6 +13704,7 @@ define void @s_shuffle_v2i32_v8i32__9_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_mov_b32 s8, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] @@ -13648,6 +13713,7 @@ define void @s_shuffle_v2i32_v8i32__9_u() { ; ; GFX942-LABEL: s_shuffle_v2i32_v8i32__9_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -13714,6 +13780,7 @@ define void @s_shuffle_v2i32_v8i32__11_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_mov_b32 s8, s7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] @@ -13726,6 +13793,7 @@ define void @s_shuffle_v2i32_v8i32__11_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_mov_b32 s8, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] @@ -13734,6 +13802,7 @@ define void @s_shuffle_v2i32_v8i32__11_u() { ; ; GFX942-LABEL: s_shuffle_v2i32_v8i32__11_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -13796,9 +13865,10 @@ define void @s_shuffle_v2i32_v8i32__13_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s9 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 +; GFX900-NEXT: s_mov_b32 s8, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -13808,9 +13878,10 @@ define void @s_shuffle_v2i32_v8i32__13_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s9 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 +; GFX90A-NEXT: s_mov_b32 s8, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -13818,6 +13889,7 @@ define void @s_shuffle_v2i32_v8i32__13_u() { ; ; GFX942-LABEL: s_shuffle_v2i32_v8i32__13_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -13884,6 +13956,7 @@ define void @s_shuffle_v2i32_v8i32__15_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_mov_b32 s8, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] @@ -13896,6 +13969,7 @@ define void @s_shuffle_v2i32_v8i32__15_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_mov_b32 s8, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] @@ -13904,6 +13978,7 @@ define void @s_shuffle_v2i32_v8i32__15_u() { ; ; GFX942-LABEL: s_shuffle_v2i32_v8i32__15_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -14679,6 +14754,7 @@ define void @s_shuffle_v2i32_v8i32__u_0() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_mov_b32 s9, s4 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] @@ -14691,6 +14767,7 @@ define void @s_shuffle_v2i32_v8i32__u_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_mov_b32 s9, s4 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] @@ -14699,6 +14776,7 @@ define void @s_shuffle_v2i32_v8i32__u_0() { ; ; GFX942-LABEL: s_shuffle_v2i32_v8i32__u_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -15027,6 +15105,7 @@ define void @s_shuffle_v2i32_v8i32__8_0() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_mov_b32 s9, s4 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] @@ -15039,6 +15118,7 @@ define void @s_shuffle_v2i32_v8i32__8_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_mov_b32 s9, s4 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] @@ -15047,6 +15127,7 @@ define void @s_shuffle_v2i32_v8i32__8_0() { ; ; GFX942-LABEL: s_shuffle_v2i32_v8i32__8_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -16023,6 +16104,7 @@ define void @s_shuffle_v2i32_v8i32__u_2() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_mov_b32 s9, s6 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] @@ -16035,6 +16117,7 @@ define void @s_shuffle_v2i32_v8i32__u_2() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_mov_b32 s9, s6 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] @@ -16043,6 +16126,7 @@ define void @s_shuffle_v2i32_v8i32__u_2() { ; ; GFX942-LABEL: s_shuffle_v2i32_v8i32__u_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -16371,6 +16455,7 @@ define void @s_shuffle_v2i32_v8i32__8_2() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_mov_b32 s9, s6 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] @@ -16383,6 +16468,7 @@ define void @s_shuffle_v2i32_v8i32__8_2() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_mov_b32 s9, s6 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] @@ -16391,6 +16477,7 @@ define void @s_shuffle_v2i32_v8i32__8_2() { ; ; GFX942-LABEL: s_shuffle_v2i32_v8i32__8_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -17464,9 +17551,10 @@ define void @s_shuffle_v2i32_v8i32__u_4() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s9, s8 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 +; GFX900-NEXT: s_mov_b32 s9, s12 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -17476,9 +17564,10 @@ define void @s_shuffle_v2i32_v8i32__u_4() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s9, s8 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 +; GFX90A-NEXT: s_mov_b32 s9, s12 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -17486,6 +17575,7 @@ define void @s_shuffle_v2i32_v8i32__u_4() { ; ; GFX942-LABEL: s_shuffle_v2i32_v8i32__u_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -17812,9 +17902,10 @@ define void @s_shuffle_v2i32_v8i32__8_4() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s9, s8 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 +; GFX900-NEXT: s_mov_b32 s9, s12 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -17824,9 +17915,10 @@ define void @s_shuffle_v2i32_v8i32__8_4() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s9, s8 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 +; GFX90A-NEXT: s_mov_b32 s9, s12 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -17834,6 +17926,7 @@ define void @s_shuffle_v2i32_v8i32__8_4() { ; ; GFX942-LABEL: s_shuffle_v2i32_v8i32__8_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -18806,6 +18899,7 @@ define void @s_shuffle_v2i32_v8i32__u_6() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_mov_b32 s9, s10 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] @@ -18818,6 +18912,7 @@ define void @s_shuffle_v2i32_v8i32__u_6() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_mov_b32 s9, s10 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] @@ -18826,6 +18921,7 @@ define void @s_shuffle_v2i32_v8i32__u_6() { ; ; GFX942-LABEL: s_shuffle_v2i32_v8i32__u_6: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -19154,6 +19250,7 @@ define void @s_shuffle_v2i32_v8i32__8_6() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_mov_b32 s9, s10 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] @@ -19166,6 +19263,7 @@ define void @s_shuffle_v2i32_v8i32__8_6() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_mov_b32 s9, s10 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] @@ -19174,6 +19272,7 @@ define void @s_shuffle_v2i32_v8i32__8_6() { ; ; GFX942-LABEL: s_shuffle_v2i32_v8i32__8_6: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -20293,6 +20392,7 @@ define void @s_shuffle_v2i32_v8i32__1_8() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_mov_b32 s8, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] @@ -20305,6 +20405,7 @@ define void @s_shuffle_v2i32_v8i32__1_8() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_mov_b32 s8, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] @@ -20313,6 +20414,7 @@ define void @s_shuffle_v2i32_v8i32__1_8() { ; ; GFX942-LABEL: s_shuffle_v2i32_v8i32__1_8: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -20377,6 +20479,7 @@ define void @s_shuffle_v2i32_v8i32__3_8() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_mov_b32 s8, s7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] @@ -20389,6 +20492,7 @@ define void @s_shuffle_v2i32_v8i32__3_8() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_mov_b32 s8, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] @@ -20397,6 +20501,7 @@ define void @s_shuffle_v2i32_v8i32__3_8() { ; ; GFX942-LABEL: s_shuffle_v2i32_v8i32__3_8: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -20457,9 +20562,10 @@ define void @s_shuffle_v2i32_v8i32__5_8() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s9 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 +; GFX900-NEXT: s_mov_b32 s8, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -20469,9 +20575,10 @@ define void @s_shuffle_v2i32_v8i32__5_8() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s9 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 +; GFX90A-NEXT: s_mov_b32 s8, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -20479,6 +20586,7 @@ define void @s_shuffle_v2i32_v8i32__5_8() { ; ; GFX942-LABEL: s_shuffle_v2i32_v8i32__5_8: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -20543,6 +20651,7 @@ define void @s_shuffle_v2i32_v8i32__7_8() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_mov_b32 s8, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] @@ -20555,6 +20664,7 @@ define void @s_shuffle_v2i32_v8i32__7_8() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_mov_b32 s8, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] @@ -20563,6 +20673,7 @@ define void @s_shuffle_v2i32_v8i32__7_8() { ; ; GFX942-LABEL: s_shuffle_v2i32_v8i32__7_8: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -21522,6 +21633,7 @@ define void @s_shuffle_v2i32_v8i32__u_10() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_mov_b32 s9, s6 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] @@ -21534,6 +21646,7 @@ define void @s_shuffle_v2i32_v8i32__u_10() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_mov_b32 s9, s6 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] @@ -21542,6 +21655,7 @@ define void @s_shuffle_v2i32_v8i32__u_10() { ; ; GFX942-LABEL: s_shuffle_v2i32_v8i32__u_10: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -23015,9 +23129,10 @@ define void @s_shuffle_v2i32_v8i32__u_12() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s9, s8 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 +; GFX900-NEXT: s_mov_b32 s9, s12 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -23027,9 +23142,10 @@ define void @s_shuffle_v2i32_v8i32__u_12() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s9, s8 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 +; GFX90A-NEXT: s_mov_b32 s9, s12 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -23037,6 +23153,7 @@ define void @s_shuffle_v2i32_v8i32__u_12() { ; ; GFX942-LABEL: s_shuffle_v2i32_v8i32__u_12: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -24442,6 +24559,7 @@ define void @s_shuffle_v2i32_v8i32__u_14() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_mov_b32 s9, s10 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] @@ -24454,6 +24572,7 @@ define void @s_shuffle_v2i32_v8i32__u_14() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_mov_b32 s9, s10 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] @@ -24462,6 +24581,7 @@ define void @s_shuffle_v2i32_v8i32__u_14() { ; ; GFX942-LABEL: s_shuffle_v2i32_v8i32__u_14: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v2i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v2i64.ll index 51dc9a51ec9d0..344a262a655ba 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v2i64.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v2i64.ll @@ -57,40 +57,44 @@ define void @v_shuffle_v2i64_v2i64__0_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v2i64_v2i64__1_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i64_v2i64__1_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i64_v2i64__1_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v2i64__1_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -113,40 +117,44 @@ define void @v_shuffle_v2i64_v2i64__2_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v2i64_v2i64__3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i64_v2i64__3_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i64_v2i64__3_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v2i64__3_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -374,10 +382,11 @@ define void @v_shuffle_v2i64_v2i64__u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -387,10 +396,11 @@ define void @v_shuffle_v2i64_v2i64__u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -400,10 +410,11 @@ define void @v_shuffle_v2i64_v2i64__u_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -510,10 +521,11 @@ define void @v_shuffle_v2i64_v2i64__2_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -523,10 +535,11 @@ define void @v_shuffle_v2i64_v2i64__2_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -536,10 +549,11 @@ define void @v_shuffle_v2i64_v2i64__2_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -763,40 +777,44 @@ define void @v_shuffle_v2i64_v2i64__0_2(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v2i64_v2i64__1_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2i64_v2i64__1_2: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2i64_v2i64__1_2: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2i64_v2i64__1_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -1065,6 +1083,7 @@ define void @s_shuffle_v2i64_v2i64__0_u() { define void @s_shuffle_v2i64_v2i64__1_u() { ; GFX900-LABEL: s_shuffle_v2i64_v2i64__1_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -1078,6 +1097,7 @@ define void @s_shuffle_v2i64_v2i64__1_u() { ; ; GFX90A-LABEL: s_shuffle_v2i64_v2i64__1_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -1091,6 +1111,7 @@ define void @s_shuffle_v2i64_v2i64__1_u() { ; ; GFX942-LABEL: s_shuffle_v2i64_v2i64__1_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -1124,6 +1145,7 @@ define void @s_shuffle_v2i64_v2i64__2_u() { define void @s_shuffle_v2i64_v2i64__3_u() { ; GFX900-LABEL: s_shuffle_v2i64_v2i64__3_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -1137,6 +1159,7 @@ define void @s_shuffle_v2i64_v2i64__3_u() { ; ; GFX90A-LABEL: s_shuffle_v2i64_v2i64__3_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -1150,6 +1173,7 @@ define void @s_shuffle_v2i64_v2i64__3_u() { ; ; GFX942-LABEL: s_shuffle_v2i64_v2i64__3_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -1358,6 +1382,7 @@ define void @s_shuffle_v2i64_v2i64__3_3() { define void @s_shuffle_v2i64_v2i64__u_0() { ; GFX900-LABEL: s_shuffle_v2i64_v2i64__u_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -1371,6 +1396,7 @@ define void @s_shuffle_v2i64_v2i64__u_0() { ; ; GFX90A-LABEL: s_shuffle_v2i64_v2i64__u_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -1384,6 +1410,7 @@ define void @s_shuffle_v2i64_v2i64__u_0() { ; ; GFX942-LABEL: s_shuffle_v2i64_v2i64__u_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -1473,6 +1500,7 @@ define void @s_shuffle_v2i64_v2i64__1_0() { define void @s_shuffle_v2i64_v2i64__2_0() { ; GFX900-LABEL: s_shuffle_v2i64_v2i64__2_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -1486,6 +1514,7 @@ define void @s_shuffle_v2i64_v2i64__2_0() { ; ; GFX90A-LABEL: s_shuffle_v2i64_v2i64__2_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -1499,6 +1528,7 @@ define void @s_shuffle_v2i64_v2i64__2_0() { ; ; GFX942-LABEL: s_shuffle_v2i64_v2i64__2_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -1711,6 +1741,7 @@ define void @s_shuffle_v2i64_v2i64__0_2() { define void @s_shuffle_v2i64_v2i64__1_2() { ; GFX900-LABEL: s_shuffle_v2i64_v2i64__1_2: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -1724,6 +1755,7 @@ define void @s_shuffle_v2i64_v2i64__1_2() { ; ; GFX90A-LABEL: s_shuffle_v2i64_v2i64__1_2: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -1737,6 +1769,7 @@ define void @s_shuffle_v2i64_v2i64__1_2() { ; ; GFX942-LABEL: s_shuffle_v2i64_v2i64__1_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v3i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v3i64.ll index bc8a56a30d8f9..1f9333b146f1c 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v3i64.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v3i64.ll @@ -100,6 +100,7 @@ define void @v_shuffle_v2i64_v3i64__2_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v4 ; GFX900-NEXT: v_mov_b32_e32 v1, v5 @@ -113,6 +114,7 @@ define void @v_shuffle_v2i64_v3i64__2_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: v_mov_b32_e32 v1, v5 @@ -126,6 +128,7 @@ define void @v_shuffle_v2i64_v3i64__2_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 @@ -196,6 +199,7 @@ define void @v_shuffle_v2i64_v3i64__5_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v4 ; GFX900-NEXT: v_mov_b32_e32 v1, v5 @@ -209,6 +213,7 @@ define void @v_shuffle_v2i64_v3i64__5_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: v_mov_b32_e32 v1, v5 @@ -222,6 +227,7 @@ define void @v_shuffle_v2i64_v3i64__5_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 @@ -560,10 +566,11 @@ define void @v_shuffle_v2i64_v3i64__u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -573,10 +580,11 @@ define void @v_shuffle_v2i64_v3i64__u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -586,10 +594,11 @@ define void @v_shuffle_v2i64_v3i64__u_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -746,10 +755,11 @@ define void @v_shuffle_v2i64_v3i64__3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -759,10 +769,11 @@ define void @v_shuffle_v2i64_v3i64__3_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -772,10 +783,11 @@ define void @v_shuffle_v2i64_v3i64__3_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -1462,6 +1474,7 @@ define void @v_shuffle_v2i64_v3i64__2_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v4 ; GFX900-NEXT: v_mov_b32_e32 v1, v5 @@ -1475,6 +1488,7 @@ define void @v_shuffle_v2i64_v3i64__2_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: v_mov_b32_e32 v1, v5 @@ -1488,6 +1502,7 @@ define void @v_shuffle_v2i64_v3i64__2_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 @@ -2206,6 +2221,7 @@ define void @s_shuffle_v2i64_v3i64__1_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_mov_b32 s8, s6 ; GFX900-NEXT: s_mov_b32 s9, s7 ; GFX900-NEXT: ;;#ASMSTART @@ -2219,6 +2235,7 @@ define void @s_shuffle_v2i64_v3i64__1_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_mov_b32 s8, s6 ; GFX90A-NEXT: s_mov_b32 s9, s7 ; GFX90A-NEXT: ;;#ASMSTART @@ -2228,6 +2245,7 @@ define void @s_shuffle_v2i64_v3i64__1_u() { ; ; GFX942-LABEL: s_shuffle_v2i64_v3i64__1_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -2249,8 +2267,11 @@ define void @s_shuffle_v2i64_v3i64__2_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[8:13] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND @@ -2260,8 +2281,11 @@ define void @s_shuffle_v2i64_v3i64__2_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[8:13] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND @@ -2269,6 +2293,7 @@ define void @s_shuffle_v2i64_v3i64__2_u() { ; ; GFX942-LABEL: s_shuffle_v2i64_v3i64__2_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -2306,6 +2331,7 @@ define void @s_shuffle_v2i64_v3i64__4_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_mov_b32 s8, s6 ; GFX900-NEXT: s_mov_b32 s9, s7 ; GFX900-NEXT: ;;#ASMSTART @@ -2319,6 +2345,7 @@ define void @s_shuffle_v2i64_v3i64__4_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_mov_b32 s8, s6 ; GFX90A-NEXT: s_mov_b32 s9, s7 ; GFX90A-NEXT: ;;#ASMSTART @@ -2328,6 +2355,7 @@ define void @s_shuffle_v2i64_v3i64__4_u() { ; ; GFX942-LABEL: s_shuffle_v2i64_v3i64__4_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -2350,8 +2378,11 @@ define void @s_shuffle_v2i64_v3i64__5_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[8:13] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND @@ -2361,8 +2392,11 @@ define void @s_shuffle_v2i64_v3i64__5_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[8:13] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND @@ -2370,6 +2404,7 @@ define void @s_shuffle_v2i64_v3i64__5_u() { ; ; GFX942-LABEL: s_shuffle_v2i64_v3i64__5_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -2681,6 +2716,7 @@ define void @s_shuffle_v2i64_v3i64__u_0() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_mov_b32 s10, s4 ; GFX900-NEXT: s_mov_b32 s11, s5 ; GFX900-NEXT: ;;#ASMSTART @@ -2694,6 +2730,7 @@ define void @s_shuffle_v2i64_v3i64__u_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_mov_b32 s10, s4 ; GFX90A-NEXT: s_mov_b32 s11, s5 ; GFX90A-NEXT: ;;#ASMSTART @@ -2703,6 +2740,7 @@ define void @s_shuffle_v2i64_v3i64__u_0() { ; ; GFX942-LABEL: s_shuffle_v2i64_v3i64__u_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -2843,6 +2881,7 @@ define void @s_shuffle_v2i64_v3i64__3_0() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_mov_b32 s10, s4 ; GFX900-NEXT: s_mov_b32 s11, s5 ; GFX900-NEXT: ;;#ASMSTART @@ -2856,6 +2895,7 @@ define void @s_shuffle_v2i64_v3i64__3_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_mov_b32 s10, s4 ; GFX90A-NEXT: s_mov_b32 s11, s5 ; GFX90A-NEXT: ;;#ASMSTART @@ -2865,6 +2905,7 @@ define void @s_shuffle_v2i64_v3i64__3_0() { ; ; GFX942-LABEL: s_shuffle_v2i64_v3i64__3_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -3160,10 +3201,11 @@ define void @s_shuffle_v2i64_v3i64__u_2() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[8:13] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s8 -; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND @@ -3173,10 +3215,11 @@ define void @s_shuffle_v2i64_v3i64__u_2() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[8:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s8 -; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND @@ -3184,6 +3227,7 @@ define void @s_shuffle_v2i64_v3i64__u_2() { ; ; GFX942-LABEL: s_shuffle_v2i64_v3i64__u_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -3326,10 +3370,11 @@ define void @s_shuffle_v2i64_v3i64__3_2() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[8:13] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s8 -; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND @@ -3339,10 +3384,11 @@ define void @s_shuffle_v2i64_v3i64__3_2() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[8:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s8 -; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND @@ -3350,6 +3396,7 @@ define void @s_shuffle_v2i64_v3i64__3_2() { ; ; GFX942-LABEL: s_shuffle_v2i64_v3i64__3_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -3488,6 +3535,7 @@ define void @s_shuffle_v2i64_v3i64__1_3() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_mov_b32 s8, s6 ; GFX900-NEXT: s_mov_b32 s9, s7 ; GFX900-NEXT: ;;#ASMSTART @@ -3501,6 +3549,7 @@ define void @s_shuffle_v2i64_v3i64__1_3() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_mov_b32 s8, s6 ; GFX90A-NEXT: s_mov_b32 s9, s7 ; GFX90A-NEXT: ;;#ASMSTART @@ -3510,6 +3559,7 @@ define void @s_shuffle_v2i64_v3i64__1_3() { ; ; GFX942-LABEL: s_shuffle_v2i64_v3i64__1_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -3531,8 +3581,11 @@ define void @s_shuffle_v2i64_v3i64__2_3() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[8:13] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND @@ -3542,8 +3595,11 @@ define void @s_shuffle_v2i64_v3i64__2_3() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[8:13] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND @@ -3551,6 +3607,7 @@ define void @s_shuffle_v2i64_v3i64__2_3() { ; ; GFX942-LABEL: s_shuffle_v2i64_v3i64__2_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -3905,10 +3962,11 @@ define void @s_shuffle_v2i64_v3i64__u_5() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[8:13] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s8 -; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND @@ -3918,10 +3976,11 @@ define void @s_shuffle_v2i64_v3i64__u_5() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[8:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s8 -; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND @@ -3929,6 +3988,7 @@ define void @s_shuffle_v2i64_v3i64__u_5() { ; ; GFX942-LABEL: s_shuffle_v2i64_v3i64__u_5: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v4i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v4i64.ll index dd42a1dd44320..e52326bbd6353 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v4i64.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v4i64.ll @@ -139,6 +139,7 @@ define void @v_shuffle_v2i64_v4i64__3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v6 ; GFX900-NEXT: v_mov_b32_e32 v1, v7 @@ -152,6 +153,7 @@ define void @v_shuffle_v2i64_v4i64__3_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 @@ -165,6 +167,7 @@ define void @v_shuffle_v2i64_v4i64__3_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v7 @@ -275,6 +278,7 @@ define void @v_shuffle_v2i64_v4i64__7_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v6 ; GFX900-NEXT: v_mov_b32_e32 v1, v7 @@ -288,6 +292,7 @@ define void @v_shuffle_v2i64_v4i64__7_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 @@ -301,6 +306,7 @@ define void @v_shuffle_v2i64_v4i64__7_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v7 @@ -741,10 +747,11 @@ define void @v_shuffle_v2i64_v4i64__u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -754,10 +761,11 @@ define void @v_shuffle_v2i64_v4i64__u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -767,10 +775,11 @@ define void @v_shuffle_v2i64_v4i64__u_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -972,10 +981,11 @@ define void @v_shuffle_v2i64_v4i64__4_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -985,10 +995,11 @@ define void @v_shuffle_v2i64_v4i64__4_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -998,10 +1009,11 @@ define void @v_shuffle_v2i64_v4i64__4_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -2349,6 +2361,7 @@ define void @v_shuffle_v2i64_v4i64__3_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v6 ; GFX900-NEXT: v_mov_b32_e32 v1, v7 @@ -2362,6 +2375,7 @@ define void @v_shuffle_v2i64_v4i64__3_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 @@ -2375,6 +2389,7 @@ define void @v_shuffle_v2i64_v4i64__3_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v7 @@ -3739,6 +3754,7 @@ define void @s_shuffle_v2i64_v4i64__1_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_mov_b32 s8, s6 ; GFX900-NEXT: s_mov_b32 s9, s7 ; GFX900-NEXT: ;;#ASMSTART @@ -3752,6 +3768,7 @@ define void @s_shuffle_v2i64_v4i64__1_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_mov_b32 s8, s6 ; GFX90A-NEXT: s_mov_b32 s9, s7 ; GFX90A-NEXT: ;;#ASMSTART @@ -3761,6 +3778,7 @@ define void @s_shuffle_v2i64_v4i64__1_u() { ; ; GFX942-LABEL: s_shuffle_v2i64_v4i64__1_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -3822,10 +3840,11 @@ define void @s_shuffle_v2i64_v4i64__3_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND @@ -3835,10 +3854,11 @@ define void @s_shuffle_v2i64_v4i64__3_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND @@ -3846,6 +3866,7 @@ define void @s_shuffle_v2i64_v4i64__3_u() { ; ; GFX942-LABEL: s_shuffle_v2i64_v4i64__3_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -3883,6 +3904,7 @@ define void @s_shuffle_v2i64_v4i64__5_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_mov_b32 s8, s6 ; GFX900-NEXT: s_mov_b32 s9, s7 ; GFX900-NEXT: ;;#ASMSTART @@ -3896,6 +3918,7 @@ define void @s_shuffle_v2i64_v4i64__5_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_mov_b32 s8, s6 ; GFX90A-NEXT: s_mov_b32 s9, s7 ; GFX90A-NEXT: ;;#ASMSTART @@ -3905,6 +3928,7 @@ define void @s_shuffle_v2i64_v4i64__5_u() { ; ; GFX942-LABEL: s_shuffle_v2i64_v4i64__5_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -3968,10 +3992,11 @@ define void @s_shuffle_v2i64_v4i64__7_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND @@ -3981,10 +4006,11 @@ define void @s_shuffle_v2i64_v4i64__7_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND @@ -3992,6 +4018,7 @@ define void @s_shuffle_v2i64_v4i64__7_u() { ; ; GFX942-LABEL: s_shuffle_v2i64_v4i64__7_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -4394,6 +4421,7 @@ define void @s_shuffle_v2i64_v4i64__u_0() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_mov_b32 s10, s4 ; GFX900-NEXT: s_mov_b32 s11, s5 ; GFX900-NEXT: ;;#ASMSTART @@ -4407,6 +4435,7 @@ define void @s_shuffle_v2i64_v4i64__u_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_mov_b32 s10, s4 ; GFX90A-NEXT: s_mov_b32 s11, s5 ; GFX90A-NEXT: ;;#ASMSTART @@ -4416,6 +4445,7 @@ define void @s_shuffle_v2i64_v4i64__u_0() { ; ; GFX942-LABEL: s_shuffle_v2i64_v4i64__u_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -4579,6 +4609,7 @@ define void @s_shuffle_v2i64_v4i64__4_0() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_mov_b32 s10, s4 ; GFX900-NEXT: s_mov_b32 s11, s5 ; GFX900-NEXT: ;;#ASMSTART @@ -4592,6 +4623,7 @@ define void @s_shuffle_v2i64_v4i64__4_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_mov_b32 s10, s4 ; GFX90A-NEXT: s_mov_b32 s11, s5 ; GFX90A-NEXT: ;;#ASMSTART @@ -4601,6 +4633,7 @@ define void @s_shuffle_v2i64_v4i64__4_0() { ; ; GFX942-LABEL: s_shuffle_v2i64_v4i64__4_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -5028,10 +5061,11 @@ define void @s_shuffle_v2i64_v4i64__u_2() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s8 -; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND @@ -5041,10 +5075,11 @@ define void @s_shuffle_v2i64_v4i64__u_2() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s8 -; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND @@ -5052,6 +5087,7 @@ define void @s_shuffle_v2i64_v4i64__u_2() { ; ; GFX942-LABEL: s_shuffle_v2i64_v4i64__u_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -5213,10 +5249,11 @@ define void @s_shuffle_v2i64_v4i64__4_2() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s8 -; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND @@ -5226,10 +5263,11 @@ define void @s_shuffle_v2i64_v4i64__4_2() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s8 -; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND @@ -5237,6 +5275,7 @@ define void @s_shuffle_v2i64_v4i64__4_2() { ; ; GFX942-LABEL: s_shuffle_v2i64_v4i64__4_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -5720,6 +5759,7 @@ define void @s_shuffle_v2i64_v4i64__1_4() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_mov_b32 s8, s6 ; GFX900-NEXT: s_mov_b32 s9, s7 ; GFX900-NEXT: ;;#ASMSTART @@ -5733,6 +5773,7 @@ define void @s_shuffle_v2i64_v4i64__1_4() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_mov_b32 s8, s6 ; GFX90A-NEXT: s_mov_b32 s9, s7 ; GFX90A-NEXT: ;;#ASMSTART @@ -5742,6 +5783,7 @@ define void @s_shuffle_v2i64_v4i64__1_4() { ; ; GFX942-LABEL: s_shuffle_v2i64_v4i64__1_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -5803,10 +5845,11 @@ define void @s_shuffle_v2i64_v4i64__3_4() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND @@ -5816,10 +5859,11 @@ define void @s_shuffle_v2i64_v4i64__3_4() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND @@ -5827,6 +5871,7 @@ define void @s_shuffle_v2i64_v4i64__3_4() { ; ; GFX942-LABEL: s_shuffle_v2i64_v4i64__3_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -6277,10 +6322,11 @@ define void @s_shuffle_v2i64_v4i64__u_6() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s8 -; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND @@ -6290,10 +6336,11 @@ define void @s_shuffle_v2i64_v4i64__u_6() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s8 -; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND @@ -6301,6 +6348,7 @@ define void @s_shuffle_v2i64_v4i64__u_6() { ; ; GFX942-LABEL: s_shuffle_v2i64_v4i64__u_6: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll index 7ee7c83e0122d..819e7876ed15e 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll @@ -295,10 +295,11 @@ define void @v_shuffle_v2i64_v8i64__7_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v14 ; GFX900-NEXT: v_mov_b32_e32 v1, v15 -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -308,6 +309,7 @@ define void @v_shuffle_v2i64_v8i64__7_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v14 ; GFX90A-NEXT: v_mov_b32_e32 v1, v15 @@ -321,6 +323,7 @@ define void @v_shuffle_v2i64_v8i64__7_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v14 ; GFX942-NEXT: v_mov_b32_e32 v1, v15 @@ -591,10 +594,11 @@ define void @v_shuffle_v2i64_v8i64__15_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v14 ; GFX900-NEXT: v_mov_b32_e32 v1, v15 -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -604,6 +608,7 @@ define void @v_shuffle_v2i64_v8i64__15_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v14 ; GFX90A-NEXT: v_mov_b32_e32 v1, v15 @@ -617,6 +622,7 @@ define void @v_shuffle_v2i64_v8i64__15_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v14 ; GFX942-NEXT: v_mov_b32_e32 v1, v15 @@ -1465,10 +1471,11 @@ define void @v_shuffle_v2i64_v8i64__u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1478,10 +1485,11 @@ define void @v_shuffle_v2i64_v8i64__u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1491,10 +1499,11 @@ define void @v_shuffle_v2i64_v8i64__u_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() @@ -1876,10 +1885,11 @@ define void @v_shuffle_v2i64_v8i64__8_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1889,10 +1899,11 @@ define void @v_shuffle_v2i64_v8i64__8_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX90A-NEXT: v_mov_b32_e32 v16, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1902,10 +1913,11 @@ define void @v_shuffle_v2i64_v8i64__8_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX942-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=v"() @@ -7917,10 +7929,11 @@ define void @v_shuffle_v2i64_v8i64__7_8(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v14 ; GFX900-NEXT: v_mov_b32_e32 v1, v15 -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7930,6 +7943,7 @@ define void @v_shuffle_v2i64_v8i64__7_8(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v14 ; GFX90A-NEXT: v_mov_b32_e32 v1, v15 @@ -7943,6 +7957,7 @@ define void @v_shuffle_v2i64_v8i64__7_8(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v14 ; GFX942-NEXT: v_mov_b32_e32 v1, v15 @@ -13931,6 +13946,7 @@ define void @s_shuffle_v2i64_v8i64__1_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:19] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_mov_b32 s8, s6 ; GFX900-NEXT: s_mov_b32 s9, s7 ; GFX900-NEXT: ;;#ASMSTART @@ -13944,6 +13960,7 @@ define void @s_shuffle_v2i64_v8i64__1_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:19] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_mov_b32 s8, s6 ; GFX90A-NEXT: s_mov_b32 s9, s7 ; GFX90A-NEXT: ;;#ASMSTART @@ -13957,6 +13974,7 @@ define void @s_shuffle_v2i64_v8i64__1_u() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_mov_b32 s8, s2 ; GFX942-NEXT: s_mov_b32 s9, s3 ; GFX942-NEXT: ;;#ASMSTART @@ -14014,10 +14032,11 @@ define void @s_shuffle_v2i64_v8i64__3_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ; def s[8:23] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND @@ -14027,10 +14046,11 @@ define void @s_shuffle_v2i64_v8i64__3_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ; def s[8:23] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND @@ -14042,6 +14062,7 @@ define void @s_shuffle_v2i64_v8i64__3_u() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_mov_b32 s8, s6 ; GFX942-NEXT: s_mov_b32 s9, s7 ; GFX942-NEXT: ;;#ASMSTART @@ -14099,44 +14120,19 @@ define void @s_shuffle_v2i64_v8i64__4_u() { } define void @s_shuffle_v2i64_v8i64__5_u() { -; GFX900-LABEL: s_shuffle_v2i64_v8i64__5_u: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:19] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v2i64_v8i64__5_u: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:19] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v2i64_v8i64__5_u: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v2i64_v8i64__5_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX9-NEXT: s_mov_b32 s8, s14 +; GFX9-NEXT: s_mov_b32 s9, s15 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -14195,6 +14191,7 @@ define void @s_shuffle_v2i64_v8i64__7_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:19] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_mov_b32 s8, s18 ; GFX900-NEXT: s_mov_b32 s9, s19 ; GFX900-NEXT: ;;#ASMSTART @@ -14208,6 +14205,7 @@ define void @s_shuffle_v2i64_v8i64__7_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:19] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_mov_b32 s8, s18 ; GFX90A-NEXT: s_mov_b32 s9, s19 ; GFX90A-NEXT: ;;#ASMSTART @@ -14221,6 +14219,7 @@ define void @s_shuffle_v2i64_v8i64__7_u() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_mov_b32 s8, s14 ; GFX942-NEXT: s_mov_b32 s9, s15 ; GFX942-NEXT: ;;#ASMSTART @@ -14254,6 +14253,7 @@ define void @s_shuffle_v2i64_v8i64__9_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:19] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_mov_b32 s8, s6 ; GFX900-NEXT: s_mov_b32 s9, s7 ; GFX900-NEXT: ;;#ASMSTART @@ -14267,6 +14267,7 @@ define void @s_shuffle_v2i64_v8i64__9_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:19] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_mov_b32 s8, s6 ; GFX90A-NEXT: s_mov_b32 s9, s7 ; GFX90A-NEXT: ;;#ASMSTART @@ -14280,6 +14281,7 @@ define void @s_shuffle_v2i64_v8i64__9_u() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_mov_b32 s8, s2 ; GFX942-NEXT: s_mov_b32 s9, s3 ; GFX942-NEXT: ;;#ASMSTART @@ -14339,10 +14341,11 @@ define void @s_shuffle_v2i64_v8i64__11_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ; def s[8:23] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND @@ -14352,10 +14355,11 @@ define void @s_shuffle_v2i64_v8i64__11_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ; def s[8:23] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND @@ -14367,6 +14371,7 @@ define void @s_shuffle_v2i64_v8i64__11_u() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_mov_b32 s8, s6 ; GFX942-NEXT: s_mov_b32 s9, s7 ; GFX942-NEXT: ;;#ASMSTART @@ -14426,44 +14431,19 @@ define void @s_shuffle_v2i64_v8i64__12_u() { } define void @s_shuffle_v2i64_v8i64__13_u() { -; GFX900-LABEL: s_shuffle_v2i64_v8i64__13_u: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:19] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v2i64_v8i64__13_u: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:19] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v2i64_v8i64__13_u: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v2i64_v8i64__13_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX9-NEXT: s_mov_b32 s8, s14 +; GFX9-NEXT: s_mov_b32 s9, s15 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -14524,6 +14504,7 @@ define void @s_shuffle_v2i64_v8i64__15_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:19] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_mov_b32 s8, s18 ; GFX900-NEXT: s_mov_b32 s9, s19 ; GFX900-NEXT: ;;#ASMSTART @@ -14537,6 +14518,7 @@ define void @s_shuffle_v2i64_v8i64__15_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:19] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_mov_b32 s8, s18 ; GFX90A-NEXT: s_mov_b32 s9, s19 ; GFX90A-NEXT: ;;#ASMSTART @@ -14550,6 +14532,7 @@ define void @s_shuffle_v2i64_v8i64__15_u() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_mov_b32 s8, s14 ; GFX942-NEXT: s_mov_b32 s9, s15 ; GFX942-NEXT: ;;#ASMSTART @@ -15560,6 +15543,7 @@ define void @s_shuffle_v2i64_v8i64__u_0() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:19] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_mov_b32 s10, s4 ; GFX900-NEXT: s_mov_b32 s11, s5 ; GFX900-NEXT: ;;#ASMSTART @@ -15573,6 +15557,7 @@ define void @s_shuffle_v2i64_v8i64__u_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:19] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_mov_b32 s10, s4 ; GFX90A-NEXT: s_mov_b32 s11, s5 ; GFX90A-NEXT: ;;#ASMSTART @@ -15586,6 +15571,7 @@ define void @s_shuffle_v2i64_v8i64__u_0() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_mov_b32 s10, s0 ; GFX942-NEXT: s_mov_b32 s11, s1 ; GFX942-NEXT: ;;#ASMSTART @@ -15947,6 +15933,7 @@ define void @s_shuffle_v2i64_v8i64__8_0() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:19] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_mov_b32 s10, s4 ; GFX900-NEXT: s_mov_b32 s11, s5 ; GFX900-NEXT: ;;#ASMSTART @@ -15960,6 +15947,7 @@ define void @s_shuffle_v2i64_v8i64__8_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:19] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_mov_b32 s10, s4 ; GFX90A-NEXT: s_mov_b32 s11, s5 ; GFX90A-NEXT: ;;#ASMSTART @@ -15973,6 +15961,7 @@ define void @s_shuffle_v2i64_v8i64__8_0() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_mov_b32 s10, s0 ; GFX942-NEXT: s_mov_b32 s11, s1 ; GFX942-NEXT: ;;#ASMSTART @@ -17127,10 +17116,11 @@ define void @s_shuffle_v2i64_v8i64__u_2() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ; def s[8:23] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s8 -; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND @@ -17140,10 +17130,11 @@ define void @s_shuffle_v2i64_v8i64__u_2() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ; def s[8:23] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s8 -; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND @@ -17155,6 +17146,7 @@ define void @s_shuffle_v2i64_v8i64__u_2() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_mov_b32 s10, s4 ; GFX942-NEXT: s_mov_b32 s11, s5 ; GFX942-NEXT: ;;#ASMSTART @@ -17514,10 +17506,11 @@ define void @s_shuffle_v2i64_v8i64__8_2() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ; def s[8:23] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s8 -; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND @@ -17527,10 +17520,11 @@ define void @s_shuffle_v2i64_v8i64__8_2() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ; def s[8:23] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s8 -; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND @@ -17542,6 +17536,7 @@ define void @s_shuffle_v2i64_v8i64__8_2() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_mov_b32 s10, s4 ; GFX942-NEXT: s_mov_b32 s11, s5 ; GFX942-NEXT: ;;#ASMSTART @@ -18712,44 +18707,19 @@ define void @s_shuffle_v2i64_v8i64__14_3() { } define void @s_shuffle_v2i64_v8i64__u_4() { -; GFX900-LABEL: s_shuffle_v2i64_v8i64__u_4: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:19] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v2i64_v8i64__u_4: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:19] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v2i64_v8i64__u_4: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s10, s8 -; GFX942-NEXT: s_mov_b32 s11, s9 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v2i64_v8i64__u_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -18979,44 +18949,19 @@ define void @s_shuffle_v2i64_v8i64__7_4() { } define void @s_shuffle_v2i64_v8i64__8_4() { -; GFX900-LABEL: s_shuffle_v2i64_v8i64__8_4: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:19] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v2i64_v8i64__8_4: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:19] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v2i64_v8i64__8_4: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s10, s8 -; GFX942-NEXT: s_mov_b32 s11, s9 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v2i64_v8i64__8_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -20419,6 +20364,7 @@ define void @s_shuffle_v2i64_v8i64__u_6() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:19] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_mov_b32 s10, s16 ; GFX900-NEXT: s_mov_b32 s11, s17 ; GFX900-NEXT: ;;#ASMSTART @@ -20432,6 +20378,7 @@ define void @s_shuffle_v2i64_v8i64__u_6() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:19] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_mov_b32 s10, s16 ; GFX90A-NEXT: s_mov_b32 s11, s17 ; GFX90A-NEXT: ;;#ASMSTART @@ -20445,6 +20392,7 @@ define void @s_shuffle_v2i64_v8i64__u_6() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_mov_b32 s10, s12 ; GFX942-NEXT: s_mov_b32 s11, s13 ; GFX942-NEXT: ;;#ASMSTART @@ -20806,6 +20754,7 @@ define void @s_shuffle_v2i64_v8i64__8_6() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:19] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_mov_b32 s10, s16 ; GFX900-NEXT: s_mov_b32 s11, s17 ; GFX900-NEXT: ;;#ASMSTART @@ -20819,6 +20768,7 @@ define void @s_shuffle_v2i64_v8i64__8_6() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:19] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_mov_b32 s10, s16 ; GFX90A-NEXT: s_mov_b32 s11, s17 ; GFX90A-NEXT: ;;#ASMSTART @@ -20832,6 +20782,7 @@ define void @s_shuffle_v2i64_v8i64__8_6() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_mov_b32 s10, s12 ; GFX942-NEXT: s_mov_b32 s11, s13 ; GFX942-NEXT: ;;#ASMSTART @@ -22599,6 +22550,7 @@ define void @s_shuffle_v2i64_v8i64__1_8() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:19] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_mov_b32 s8, s6 ; GFX900-NEXT: s_mov_b32 s9, s7 ; GFX900-NEXT: ;;#ASMSTART @@ -22612,6 +22564,7 @@ define void @s_shuffle_v2i64_v8i64__1_8() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:19] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_mov_b32 s8, s6 ; GFX90A-NEXT: s_mov_b32 s9, s7 ; GFX90A-NEXT: ;;#ASMSTART @@ -22625,6 +22578,7 @@ define void @s_shuffle_v2i64_v8i64__1_8() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_mov_b32 s8, s2 ; GFX942-NEXT: s_mov_b32 s9, s3 ; GFX942-NEXT: ;;#ASMSTART @@ -22682,10 +22636,11 @@ define void @s_shuffle_v2i64_v8i64__3_8() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ; def s[8:23] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND @@ -22695,10 +22650,11 @@ define void @s_shuffle_v2i64_v8i64__3_8() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ; def s[8:23] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND @@ -22710,6 +22666,7 @@ define void @s_shuffle_v2i64_v8i64__3_8() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_mov_b32 s8, s6 ; GFX942-NEXT: s_mov_b32 s9, s7 ; GFX942-NEXT: ;;#ASMSTART @@ -22767,44 +22724,19 @@ define void @s_shuffle_v2i64_v8i64__4_8() { } define void @s_shuffle_v2i64_v8i64__5_8() { -; GFX900-LABEL: s_shuffle_v2i64_v8i64__5_8: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:19] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v2i64_v8i64__5_8: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:19] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v2i64_v8i64__5_8: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v2i64_v8i64__5_8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX9-NEXT: s_mov_b32 s8, s14 +; GFX9-NEXT: s_mov_b32 s9, s15 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> call void asm sideeffect "; use $0", "{s[8:11]}"(<2 x i64> %shuf) @@ -22863,6 +22795,7 @@ define void @s_shuffle_v2i64_v8i64__7_8() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:19] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_mov_b32 s8, s18 ; GFX900-NEXT: s_mov_b32 s9, s19 ; GFX900-NEXT: ;;#ASMSTART @@ -22876,6 +22809,7 @@ define void @s_shuffle_v2i64_v8i64__7_8() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:19] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_mov_b32 s8, s18 ; GFX90A-NEXT: s_mov_b32 s9, s19 ; GFX90A-NEXT: ;;#ASMSTART @@ -22889,6 +22823,7 @@ define void @s_shuffle_v2i64_v8i64__7_8() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_mov_b32 s8, s14 ; GFX942-NEXT: s_mov_b32 s9, s15 ; GFX942-NEXT: ;;#ASMSTART @@ -24077,10 +24012,11 @@ define void @s_shuffle_v2i64_v8i64__u_10() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ; def s[8:23] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s8 -; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND @@ -24090,10 +24026,11 @@ define void @s_shuffle_v2i64_v8i64__u_10() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ; def s[8:23] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s8 -; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND @@ -24105,6 +24042,7 @@ define void @s_shuffle_v2i64_v8i64__u_10() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_mov_b32 s10, s4 ; GFX942-NEXT: s_mov_b32 s11, s5 ; GFX942-NEXT: ;;#ASMSTART @@ -25933,44 +25871,19 @@ define void @s_shuffle_v2i64_v8i64__14_11() { } define void @s_shuffle_v2i64_v8i64__u_12() { -; GFX900-LABEL: s_shuffle_v2i64_v8i64__u_12: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:19] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use s[8:11] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: s_shuffle_v2i64_v8i64__u_12: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:19] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s[8:11] -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX942-LABEL: s_shuffle_v2i64_v8i64__u_12: -; GFX942: ; %bb.0: -; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:15] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s10, s8 -; GFX942-NEXT: s_mov_b32 s11, s9 -; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; use s[8:11] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: s_shuffle_v2i64_v8i64__u_12: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:19] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX9-NEXT: s_mov_b32 s10, s12 +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[8:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i64> asm "; def $0", "=s"() %vec1 = call <8 x i64> asm "; def $0", "=s"() %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> @@ -27792,6 +27705,7 @@ define void @s_shuffle_v2i64_v8i64__u_14() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:19] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_mov_b32 s10, s16 ; GFX900-NEXT: s_mov_b32 s11, s17 ; GFX900-NEXT: ;;#ASMSTART @@ -27805,6 +27719,7 @@ define void @s_shuffle_v2i64_v8i64__u_14() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:19] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_mov_b32 s10, s16 ; GFX90A-NEXT: s_mov_b32 s11, s17 ; GFX90A-NEXT: ;;#ASMSTART @@ -27818,6 +27733,7 @@ define void @s_shuffle_v2i64_v8i64__u_14() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_mov_b32 s10, s12 ; GFX942-NEXT: s_mov_b32 s11, s13 ; GFX942-NEXT: ;;#ASMSTART diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v2p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v2p0.ll index 7f8f2dbbb09a1..442a0026bb890 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v2p0.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v2p0.ll @@ -57,40 +57,44 @@ define void @v_shuffle_v2p0_v2p0__0_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v2p0_v2p0__1_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p0_v2p0__1_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p0_v2p0__1_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p0_v2p0__1_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -113,40 +117,44 @@ define void @v_shuffle_v2p0_v2p0__2_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v2p0_v2p0__3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p0_v2p0__3_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p0_v2p0__3_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p0_v2p0__3_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -374,10 +382,11 @@ define void @v_shuffle_v2p0_v2p0__u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -387,10 +396,11 @@ define void @v_shuffle_v2p0_v2p0__u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -400,10 +410,11 @@ define void @v_shuffle_v2p0_v2p0__u_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -510,10 +521,11 @@ define void @v_shuffle_v2p0_v2p0__2_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -523,10 +535,11 @@ define void @v_shuffle_v2p0_v2p0__2_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -536,10 +549,11 @@ define void @v_shuffle_v2p0_v2p0__2_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -763,40 +777,44 @@ define void @v_shuffle_v2p0_v2p0__0_2(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v2p0_v2p0__1_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p0_v2p0__1_2: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p0_v2p0__1_2: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p0_v2p0__1_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -1065,6 +1083,7 @@ define void @s_shuffle_v2p0_v2p0__0_u() { define void @s_shuffle_v2p0_v2p0__1_u() { ; GFX900-LABEL: s_shuffle_v2p0_v2p0__1_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -1078,6 +1097,7 @@ define void @s_shuffle_v2p0_v2p0__1_u() { ; ; GFX90A-LABEL: s_shuffle_v2p0_v2p0__1_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -1091,6 +1111,7 @@ define void @s_shuffle_v2p0_v2p0__1_u() { ; ; GFX942-LABEL: s_shuffle_v2p0_v2p0__1_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -1124,6 +1145,7 @@ define void @s_shuffle_v2p0_v2p0__2_u() { define void @s_shuffle_v2p0_v2p0__3_u() { ; GFX900-LABEL: s_shuffle_v2p0_v2p0__3_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -1137,6 +1159,7 @@ define void @s_shuffle_v2p0_v2p0__3_u() { ; ; GFX90A-LABEL: s_shuffle_v2p0_v2p0__3_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -1150,6 +1173,7 @@ define void @s_shuffle_v2p0_v2p0__3_u() { ; ; GFX942-LABEL: s_shuffle_v2p0_v2p0__3_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -1358,6 +1382,7 @@ define void @s_shuffle_v2p0_v2p0__3_3() { define void @s_shuffle_v2p0_v2p0__u_0() { ; GFX900-LABEL: s_shuffle_v2p0_v2p0__u_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -1371,6 +1396,7 @@ define void @s_shuffle_v2p0_v2p0__u_0() { ; ; GFX90A-LABEL: s_shuffle_v2p0_v2p0__u_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -1384,6 +1410,7 @@ define void @s_shuffle_v2p0_v2p0__u_0() { ; ; GFX942-LABEL: s_shuffle_v2p0_v2p0__u_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -1473,6 +1500,7 @@ define void @s_shuffle_v2p0_v2p0__1_0() { define void @s_shuffle_v2p0_v2p0__2_0() { ; GFX900-LABEL: s_shuffle_v2p0_v2p0__2_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -1486,6 +1514,7 @@ define void @s_shuffle_v2p0_v2p0__2_0() { ; ; GFX90A-LABEL: s_shuffle_v2p0_v2p0__2_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -1499,6 +1528,7 @@ define void @s_shuffle_v2p0_v2p0__2_0() { ; ; GFX942-LABEL: s_shuffle_v2p0_v2p0__2_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -1711,6 +1741,7 @@ define void @s_shuffle_v2p0_v2p0__0_2() { define void @s_shuffle_v2p0_v2p0__1_2() { ; GFX900-LABEL: s_shuffle_v2p0_v2p0__1_2: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -1724,6 +1755,7 @@ define void @s_shuffle_v2p0_v2p0__1_2() { ; ; GFX90A-LABEL: s_shuffle_v2p0_v2p0__1_2: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -1737,6 +1769,7 @@ define void @s_shuffle_v2p0_v2p0__1_2() { ; ; GFX942-LABEL: s_shuffle_v2p0_v2p0__1_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v3p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v3p0.ll index 27a6cf11c4cb1..2455b7be778b3 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v3p0.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v3p0.ll @@ -100,6 +100,7 @@ define void @v_shuffle_v2p0_v3p0__2_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v4 ; GFX900-NEXT: v_mov_b32_e32 v1, v5 @@ -113,6 +114,7 @@ define void @v_shuffle_v2p0_v3p0__2_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: v_mov_b32_e32 v1, v5 @@ -126,6 +128,7 @@ define void @v_shuffle_v2p0_v3p0__2_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 @@ -196,6 +199,7 @@ define void @v_shuffle_v2p0_v3p0__5_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v4 ; GFX900-NEXT: v_mov_b32_e32 v1, v5 @@ -209,6 +213,7 @@ define void @v_shuffle_v2p0_v3p0__5_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: v_mov_b32_e32 v1, v5 @@ -222,6 +227,7 @@ define void @v_shuffle_v2p0_v3p0__5_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 @@ -560,10 +566,11 @@ define void @v_shuffle_v2p0_v3p0__u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -573,10 +580,11 @@ define void @v_shuffle_v2p0_v3p0__u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -586,10 +594,11 @@ define void @v_shuffle_v2p0_v3p0__u_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -746,10 +755,11 @@ define void @v_shuffle_v2p0_v3p0__3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -759,10 +769,11 @@ define void @v_shuffle_v2p0_v3p0__3_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -772,10 +783,11 @@ define void @v_shuffle_v2p0_v3p0__3_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -1462,6 +1474,7 @@ define void @v_shuffle_v2p0_v3p0__2_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v4 ; GFX900-NEXT: v_mov_b32_e32 v1, v5 @@ -1475,6 +1488,7 @@ define void @v_shuffle_v2p0_v3p0__2_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: v_mov_b32_e32 v1, v5 @@ -1488,6 +1502,7 @@ define void @v_shuffle_v2p0_v3p0__2_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 @@ -2206,6 +2221,7 @@ define void @s_shuffle_v2p0_v3p0__1_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_mov_b32 s8, s6 ; GFX900-NEXT: s_mov_b32 s9, s7 ; GFX900-NEXT: ;;#ASMSTART @@ -2219,6 +2235,7 @@ define void @s_shuffle_v2p0_v3p0__1_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_mov_b32 s8, s6 ; GFX90A-NEXT: s_mov_b32 s9, s7 ; GFX90A-NEXT: ;;#ASMSTART @@ -2228,6 +2245,7 @@ define void @s_shuffle_v2p0_v3p0__1_u() { ; ; GFX942-LABEL: s_shuffle_v2p0_v3p0__1_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -2249,8 +2267,11 @@ define void @s_shuffle_v2p0_v3p0__2_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[8:13] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND @@ -2260,8 +2281,11 @@ define void @s_shuffle_v2p0_v3p0__2_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[8:13] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND @@ -2269,6 +2293,7 @@ define void @s_shuffle_v2p0_v3p0__2_u() { ; ; GFX942-LABEL: s_shuffle_v2p0_v3p0__2_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -2306,6 +2331,7 @@ define void @s_shuffle_v2p0_v3p0__4_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_mov_b32 s8, s6 ; GFX900-NEXT: s_mov_b32 s9, s7 ; GFX900-NEXT: ;;#ASMSTART @@ -2319,6 +2345,7 @@ define void @s_shuffle_v2p0_v3p0__4_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_mov_b32 s8, s6 ; GFX90A-NEXT: s_mov_b32 s9, s7 ; GFX90A-NEXT: ;;#ASMSTART @@ -2328,6 +2355,7 @@ define void @s_shuffle_v2p0_v3p0__4_u() { ; ; GFX942-LABEL: s_shuffle_v2p0_v3p0__4_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -2350,8 +2378,11 @@ define void @s_shuffle_v2p0_v3p0__5_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[8:13] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND @@ -2361,8 +2392,11 @@ define void @s_shuffle_v2p0_v3p0__5_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[8:13] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND @@ -2370,6 +2404,7 @@ define void @s_shuffle_v2p0_v3p0__5_u() { ; ; GFX942-LABEL: s_shuffle_v2p0_v3p0__5_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -2681,6 +2716,7 @@ define void @s_shuffle_v2p0_v3p0__u_0() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_mov_b32 s10, s4 ; GFX900-NEXT: s_mov_b32 s11, s5 ; GFX900-NEXT: ;;#ASMSTART @@ -2694,6 +2730,7 @@ define void @s_shuffle_v2p0_v3p0__u_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_mov_b32 s10, s4 ; GFX90A-NEXT: s_mov_b32 s11, s5 ; GFX90A-NEXT: ;;#ASMSTART @@ -2703,6 +2740,7 @@ define void @s_shuffle_v2p0_v3p0__u_0() { ; ; GFX942-LABEL: s_shuffle_v2p0_v3p0__u_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -2843,6 +2881,7 @@ define void @s_shuffle_v2p0_v3p0__3_0() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_mov_b32 s10, s4 ; GFX900-NEXT: s_mov_b32 s11, s5 ; GFX900-NEXT: ;;#ASMSTART @@ -2856,6 +2895,7 @@ define void @s_shuffle_v2p0_v3p0__3_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_mov_b32 s10, s4 ; GFX90A-NEXT: s_mov_b32 s11, s5 ; GFX90A-NEXT: ;;#ASMSTART @@ -2865,6 +2905,7 @@ define void @s_shuffle_v2p0_v3p0__3_0() { ; ; GFX942-LABEL: s_shuffle_v2p0_v3p0__3_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -3160,10 +3201,11 @@ define void @s_shuffle_v2p0_v3p0__u_2() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[8:13] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s8 -; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND @@ -3173,10 +3215,11 @@ define void @s_shuffle_v2p0_v3p0__u_2() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[8:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s8 -; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND @@ -3184,6 +3227,7 @@ define void @s_shuffle_v2p0_v3p0__u_2() { ; ; GFX942-LABEL: s_shuffle_v2p0_v3p0__u_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -3326,10 +3370,11 @@ define void @s_shuffle_v2p0_v3p0__3_2() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[8:13] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s8 -; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND @@ -3339,10 +3384,11 @@ define void @s_shuffle_v2p0_v3p0__3_2() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[8:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s8 -; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND @@ -3350,6 +3396,7 @@ define void @s_shuffle_v2p0_v3p0__3_2() { ; ; GFX942-LABEL: s_shuffle_v2p0_v3p0__3_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -3488,6 +3535,7 @@ define void @s_shuffle_v2p0_v3p0__1_3() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_mov_b32 s8, s6 ; GFX900-NEXT: s_mov_b32 s9, s7 ; GFX900-NEXT: ;;#ASMSTART @@ -3501,6 +3549,7 @@ define void @s_shuffle_v2p0_v3p0__1_3() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_mov_b32 s8, s6 ; GFX90A-NEXT: s_mov_b32 s9, s7 ; GFX90A-NEXT: ;;#ASMSTART @@ -3510,6 +3559,7 @@ define void @s_shuffle_v2p0_v3p0__1_3() { ; ; GFX942-LABEL: s_shuffle_v2p0_v3p0__1_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -3531,8 +3581,11 @@ define void @s_shuffle_v2p0_v3p0__2_3() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[8:13] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND @@ -3542,8 +3595,11 @@ define void @s_shuffle_v2p0_v3p0__2_3() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[8:13] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND @@ -3551,6 +3607,7 @@ define void @s_shuffle_v2p0_v3p0__2_3() { ; ; GFX942-LABEL: s_shuffle_v2p0_v3p0__2_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -3905,10 +3962,11 @@ define void @s_shuffle_v2p0_v3p0__u_5() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[8:13] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s8 -; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND @@ -3918,10 +3976,11 @@ define void @s_shuffle_v2p0_v3p0__u_5() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[8:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s8 -; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND @@ -3929,6 +3988,7 @@ define void @s_shuffle_v2p0_v3p0__u_5() { ; ; GFX942-LABEL: s_shuffle_v2p0_v3p0__u_5: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v4p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v4p0.ll index ae31524ebaa7f..59caa3e76c000 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v4p0.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v4p0.ll @@ -139,6 +139,7 @@ define void @v_shuffle_v2p0_v4p0__3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v6 ; GFX900-NEXT: v_mov_b32_e32 v1, v7 @@ -152,6 +153,7 @@ define void @v_shuffle_v2p0_v4p0__3_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 @@ -165,6 +167,7 @@ define void @v_shuffle_v2p0_v4p0__3_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v7 @@ -275,6 +278,7 @@ define void @v_shuffle_v2p0_v4p0__7_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v6 ; GFX900-NEXT: v_mov_b32_e32 v1, v7 @@ -288,6 +292,7 @@ define void @v_shuffle_v2p0_v4p0__7_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 @@ -301,6 +306,7 @@ define void @v_shuffle_v2p0_v4p0__7_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v7 @@ -741,10 +747,11 @@ define void @v_shuffle_v2p0_v4p0__u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -754,10 +761,11 @@ define void @v_shuffle_v2p0_v4p0__u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -767,10 +775,11 @@ define void @v_shuffle_v2p0_v4p0__u_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -972,10 +981,11 @@ define void @v_shuffle_v2p0_v4p0__4_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -985,10 +995,11 @@ define void @v_shuffle_v2p0_v4p0__4_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -998,10 +1009,11 @@ define void @v_shuffle_v2p0_v4p0__4_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -2349,6 +2361,7 @@ define void @v_shuffle_v2p0_v4p0__3_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v6 ; GFX900-NEXT: v_mov_b32_e32 v1, v7 @@ -2362,6 +2375,7 @@ define void @v_shuffle_v2p0_v4p0__3_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 @@ -2375,6 +2389,7 @@ define void @v_shuffle_v2p0_v4p0__3_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v7 @@ -3739,6 +3754,7 @@ define void @s_shuffle_v2p0_v4p0__1_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_mov_b32 s8, s6 ; GFX900-NEXT: s_mov_b32 s9, s7 ; GFX900-NEXT: ;;#ASMSTART @@ -3752,6 +3768,7 @@ define void @s_shuffle_v2p0_v4p0__1_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_mov_b32 s8, s6 ; GFX90A-NEXT: s_mov_b32 s9, s7 ; GFX90A-NEXT: ;;#ASMSTART @@ -3761,6 +3778,7 @@ define void @s_shuffle_v2p0_v4p0__1_u() { ; ; GFX942-LABEL: s_shuffle_v2p0_v4p0__1_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -3822,10 +3840,11 @@ define void @s_shuffle_v2p0_v4p0__3_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND @@ -3835,10 +3854,11 @@ define void @s_shuffle_v2p0_v4p0__3_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND @@ -3846,6 +3866,7 @@ define void @s_shuffle_v2p0_v4p0__3_u() { ; ; GFX942-LABEL: s_shuffle_v2p0_v4p0__3_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -3883,6 +3904,7 @@ define void @s_shuffle_v2p0_v4p0__5_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_mov_b32 s8, s6 ; GFX900-NEXT: s_mov_b32 s9, s7 ; GFX900-NEXT: ;;#ASMSTART @@ -3896,6 +3918,7 @@ define void @s_shuffle_v2p0_v4p0__5_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_mov_b32 s8, s6 ; GFX90A-NEXT: s_mov_b32 s9, s7 ; GFX90A-NEXT: ;;#ASMSTART @@ -3905,6 +3928,7 @@ define void @s_shuffle_v2p0_v4p0__5_u() { ; ; GFX942-LABEL: s_shuffle_v2p0_v4p0__5_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -3968,10 +3992,11 @@ define void @s_shuffle_v2p0_v4p0__7_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND @@ -3981,10 +4006,11 @@ define void @s_shuffle_v2p0_v4p0__7_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND @@ -3992,6 +4018,7 @@ define void @s_shuffle_v2p0_v4p0__7_u() { ; ; GFX942-LABEL: s_shuffle_v2p0_v4p0__7_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -4394,6 +4421,7 @@ define void @s_shuffle_v2p0_v4p0__u_0() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_mov_b32 s10, s4 ; GFX900-NEXT: s_mov_b32 s11, s5 ; GFX900-NEXT: ;;#ASMSTART @@ -4407,6 +4435,7 @@ define void @s_shuffle_v2p0_v4p0__u_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_mov_b32 s10, s4 ; GFX90A-NEXT: s_mov_b32 s11, s5 ; GFX90A-NEXT: ;;#ASMSTART @@ -4416,6 +4445,7 @@ define void @s_shuffle_v2p0_v4p0__u_0() { ; ; GFX942-LABEL: s_shuffle_v2p0_v4p0__u_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -4579,6 +4609,7 @@ define void @s_shuffle_v2p0_v4p0__4_0() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_mov_b32 s10, s4 ; GFX900-NEXT: s_mov_b32 s11, s5 ; GFX900-NEXT: ;;#ASMSTART @@ -4592,6 +4623,7 @@ define void @s_shuffle_v2p0_v4p0__4_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_mov_b32 s10, s4 ; GFX90A-NEXT: s_mov_b32 s11, s5 ; GFX90A-NEXT: ;;#ASMSTART @@ -4601,6 +4633,7 @@ define void @s_shuffle_v2p0_v4p0__4_0() { ; ; GFX942-LABEL: s_shuffle_v2p0_v4p0__4_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -5028,10 +5061,11 @@ define void @s_shuffle_v2p0_v4p0__u_2() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s8 -; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND @@ -5041,10 +5075,11 @@ define void @s_shuffle_v2p0_v4p0__u_2() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s8 -; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND @@ -5052,6 +5087,7 @@ define void @s_shuffle_v2p0_v4p0__u_2() { ; ; GFX942-LABEL: s_shuffle_v2p0_v4p0__u_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -5213,10 +5249,11 @@ define void @s_shuffle_v2p0_v4p0__4_2() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s8 -; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND @@ -5226,10 +5263,11 @@ define void @s_shuffle_v2p0_v4p0__4_2() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s8 -; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND @@ -5237,6 +5275,7 @@ define void @s_shuffle_v2p0_v4p0__4_2() { ; ; GFX942-LABEL: s_shuffle_v2p0_v4p0__4_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -5720,6 +5759,7 @@ define void @s_shuffle_v2p0_v4p0__1_4() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_mov_b32 s8, s6 ; GFX900-NEXT: s_mov_b32 s9, s7 ; GFX900-NEXT: ;;#ASMSTART @@ -5733,6 +5773,7 @@ define void @s_shuffle_v2p0_v4p0__1_4() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_mov_b32 s8, s6 ; GFX90A-NEXT: s_mov_b32 s9, s7 ; GFX90A-NEXT: ;;#ASMSTART @@ -5742,6 +5783,7 @@ define void @s_shuffle_v2p0_v4p0__1_4() { ; ; GFX942-LABEL: s_shuffle_v2p0_v4p0__1_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -5803,10 +5845,11 @@ define void @s_shuffle_v2p0_v4p0__3_4() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND @@ -5816,10 +5859,11 @@ define void @s_shuffle_v2p0_v4p0__3_4() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND @@ -5827,6 +5871,7 @@ define void @s_shuffle_v2p0_v4p0__3_4() { ; ; GFX942-LABEL: s_shuffle_v2p0_v4p0__3_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -6277,10 +6322,11 @@ define void @s_shuffle_v2p0_v4p0__u_6() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s8 -; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND @@ -6290,10 +6336,11 @@ define void @s_shuffle_v2p0_v4p0__u_6() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s8 -; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND @@ -6301,6 +6348,7 @@ define void @s_shuffle_v2p0_v4p0__u_6() { ; ; GFX942-LABEL: s_shuffle_v2p0_v4p0__u_6: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v2p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v2p3.ll index 299dfba482953..9fc76d404b0bb 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v2p3.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v2p3.ll @@ -57,37 +57,41 @@ define void @v_shuffle_v2p3_v2p3__0_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v2p3_v2p3__1_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v2p3__1_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p3_v2p3__1_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p3_v2p3__1_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -110,37 +114,41 @@ define void @v_shuffle_v2p3_v2p3__2_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v2p3_v2p3__3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v2p3__3_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p3_v2p3__3_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p3_v2p3__3_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -350,33 +358,37 @@ define void @v_shuffle_v2p3_v2p3__u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p3_v2p3__u_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p3_v2p3__u_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -476,33 +488,37 @@ define void @v_shuffle_v2p3_v2p3__2_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p3_v2p3__2_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p3_v2p3__2_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -723,37 +739,41 @@ define void @v_shuffle_v2p3_v2p3__0_2(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v2p3_v2p3__1_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v2p3_v2p3__1_2: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v2p3_v2p3__1_2: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v2p3_v2p3__1_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1016,6 +1036,7 @@ define void @s_shuffle_v2p3_v2p3__0_u() { define void @s_shuffle_v2p3_v2p3__1_u() { ; GFX900-LABEL: s_shuffle_v2p3_v2p3__1_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -1028,6 +1049,7 @@ define void @s_shuffle_v2p3_v2p3__1_u() { ; ; GFX90A-LABEL: s_shuffle_v2p3_v2p3__1_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -1040,6 +1062,7 @@ define void @s_shuffle_v2p3_v2p3__1_u() { ; ; GFX942-LABEL: s_shuffle_v2p3_v2p3__1_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -1072,6 +1095,7 @@ define void @s_shuffle_v2p3_v2p3__2_u() { define void @s_shuffle_v2p3_v2p3__3_u() { ; GFX900-LABEL: s_shuffle_v2p3_v2p3__3_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -1084,6 +1108,7 @@ define void @s_shuffle_v2p3_v2p3__3_u() { ; ; GFX90A-LABEL: s_shuffle_v2p3_v2p3__3_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -1096,6 +1121,7 @@ define void @s_shuffle_v2p3_v2p3__3_u() { ; ; GFX942-LABEL: s_shuffle_v2p3_v2p3__3_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -1287,6 +1313,7 @@ define void @s_shuffle_v2p3_v2p3__3_3() { define void @s_shuffle_v2p3_v2p3__u_0() { ; GFX900-LABEL: s_shuffle_v2p3_v2p3__u_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -1299,6 +1326,7 @@ define void @s_shuffle_v2p3_v2p3__u_0() { ; ; GFX90A-LABEL: s_shuffle_v2p3_v2p3__u_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -1311,6 +1339,7 @@ define void @s_shuffle_v2p3_v2p3__u_0() { ; ; GFX942-LABEL: s_shuffle_v2p3_v2p3__u_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -1392,6 +1421,7 @@ define void @s_shuffle_v2p3_v2p3__1_0() { define void @s_shuffle_v2p3_v2p3__2_0() { ; GFX900-LABEL: s_shuffle_v2p3_v2p3__2_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -1404,6 +1434,7 @@ define void @s_shuffle_v2p3_v2p3__2_0() { ; ; GFX90A-LABEL: s_shuffle_v2p3_v2p3__2_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -1416,6 +1447,7 @@ define void @s_shuffle_v2p3_v2p3__2_0() { ; ; GFX942-LABEL: s_shuffle_v2p3_v2p3__2_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -1626,6 +1658,7 @@ define void @s_shuffle_v2p3_v2p3__0_2() { define void @s_shuffle_v2p3_v2p3__1_2() { ; GFX900-LABEL: s_shuffle_v2p3_v2p3__1_2: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -1638,6 +1671,7 @@ define void @s_shuffle_v2p3_v2p3__1_2() { ; ; GFX90A-LABEL: s_shuffle_v2p3_v2p3__1_2: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -1650,6 +1684,7 @@ define void @s_shuffle_v2p3_v2p3__1_2() { ; ; GFX942-LABEL: s_shuffle_v2p3_v2p3__1_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v3p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v3p3.ll index 13e3d94c35446..72efe2990ce82 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v3p3.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v3p3.ll @@ -72,9 +72,10 @@ define void @v_shuffle_v2p3_v3p3__1_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -84,9 +85,10 @@ define void @v_shuffle_v2p3_v3p3__1_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -102,6 +104,7 @@ define void @v_shuffle_v2p3_v3p3__2_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v2 ; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] @@ -114,6 +117,7 @@ define void @v_shuffle_v2p3_v3p3__2_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] @@ -126,6 +130,7 @@ define void @v_shuffle_v2p3_v3p3__2_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] @@ -166,9 +171,10 @@ define void @v_shuffle_v2p3_v3p3__4_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -178,9 +184,10 @@ define void @v_shuffle_v2p3_v3p3__4_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -197,6 +204,7 @@ define void @v_shuffle_v2p3_v3p3__5_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v2 ; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] @@ -209,6 +217,7 @@ define void @v_shuffle_v2p3_v3p3__5_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] @@ -221,6 +230,7 @@ define void @v_shuffle_v2p3_v3p3__5_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] @@ -536,9 +546,10 @@ define void @v_shuffle_v2p3_v3p3__u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -548,9 +559,10 @@ define void @v_shuffle_v2p3_v3p3__u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -560,9 +572,10 @@ define void @v_shuffle_v2p3_v3p3__u_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -706,9 +719,10 @@ define void @v_shuffle_v2p3_v3p3__3_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -718,9 +732,10 @@ define void @v_shuffle_v2p3_v3p3__3_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -730,9 +745,10 @@ define void @v_shuffle_v2p3_v3p3__3_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1066,6 +1082,7 @@ define void @v_shuffle_v2p3_v3p3__u_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] @@ -1078,6 +1095,7 @@ define void @v_shuffle_v2p3_v3p3__u_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: v_mov_b32_e32 v1, v2 ; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] @@ -1236,6 +1254,7 @@ define void @v_shuffle_v2p3_v3p3__3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] @@ -1248,6 +1267,7 @@ define void @v_shuffle_v2p3_v3p3__3_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: v_mov_b32_e32 v1, v2 ; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] @@ -1381,9 +1401,10 @@ define void @v_shuffle_v2p3_v3p3__1_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1393,9 +1414,10 @@ define void @v_shuffle_v2p3_v3p3__1_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1411,6 +1433,7 @@ define void @v_shuffle_v2p3_v3p3__2_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v2 ; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] @@ -1423,6 +1446,7 @@ define void @v_shuffle_v2p3_v3p3__2_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] @@ -1435,6 +1459,7 @@ define void @v_shuffle_v2p3_v3p3__2_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] @@ -1800,6 +1825,7 @@ define void @v_shuffle_v2p3_v3p3__u_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] @@ -1812,6 +1838,7 @@ define void @v_shuffle_v2p3_v3p3__u_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: v_mov_b32_e32 v1, v2 ; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] @@ -2129,6 +2156,7 @@ define void @s_shuffle_v2p3_v3p3__0_u() { define void @s_shuffle_v2p3_v3p3__1_u() { ; GFX900-LABEL: s_shuffle_v2p3_v3p3__1_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -2141,6 +2169,7 @@ define void @s_shuffle_v2p3_v3p3__1_u() { ; ; GFX90A-LABEL: s_shuffle_v2p3_v3p3__1_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -2153,6 +2182,7 @@ define void @s_shuffle_v2p3_v3p3__1_u() { ; ; GFX942-LABEL: s_shuffle_v2p3_v3p3__1_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -2171,6 +2201,7 @@ define void @s_shuffle_v2p3_v3p3__1_u() { define void @s_shuffle_v2p3_v3p3__2_u() { ; GFX900-LABEL: s_shuffle_v2p3_v3p3__2_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -2183,6 +2214,7 @@ define void @s_shuffle_v2p3_v3p3__2_u() { ; ; GFX90A-LABEL: s_shuffle_v2p3_v3p3__2_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -2195,6 +2227,7 @@ define void @s_shuffle_v2p3_v3p3__2_u() { ; ; GFX942-LABEL: s_shuffle_v2p3_v3p3__2_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -2227,6 +2260,7 @@ define void @s_shuffle_v2p3_v3p3__3_u() { define void @s_shuffle_v2p3_v3p3__4_u() { ; GFX900-LABEL: s_shuffle_v2p3_v3p3__4_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -2239,6 +2273,7 @@ define void @s_shuffle_v2p3_v3p3__4_u() { ; ; GFX90A-LABEL: s_shuffle_v2p3_v3p3__4_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -2251,6 +2286,7 @@ define void @s_shuffle_v2p3_v3p3__4_u() { ; ; GFX942-LABEL: s_shuffle_v2p3_v3p3__4_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -2270,6 +2306,7 @@ define void @s_shuffle_v2p3_v3p3__4_u() { define void @s_shuffle_v2p3_v3p3__5_u() { ; GFX900-LABEL: s_shuffle_v2p3_v3p3__5_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -2282,6 +2319,7 @@ define void @s_shuffle_v2p3_v3p3__5_u() { ; ; GFX90A-LABEL: s_shuffle_v2p3_v3p3__5_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -2294,6 +2332,7 @@ define void @s_shuffle_v2p3_v3p3__5_u() { ; ; GFX942-LABEL: s_shuffle_v2p3_v3p3__5_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -2586,6 +2625,7 @@ define void @s_shuffle_v2p3_v3p3__5_5() { define void @s_shuffle_v2p3_v3p3__u_0() { ; GFX900-LABEL: s_shuffle_v2p3_v3p3__u_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -2598,6 +2638,7 @@ define void @s_shuffle_v2p3_v3p3__u_0() { ; ; GFX90A-LABEL: s_shuffle_v2p3_v3p3__u_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -2610,6 +2651,7 @@ define void @s_shuffle_v2p3_v3p3__u_0() { ; ; GFX942-LABEL: s_shuffle_v2p3_v3p3__u_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -2736,6 +2778,7 @@ define void @s_shuffle_v2p3_v3p3__2_0() { define void @s_shuffle_v2p3_v3p3__3_0() { ; GFX900-LABEL: s_shuffle_v2p3_v3p3__3_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -2748,6 +2791,7 @@ define void @s_shuffle_v2p3_v3p3__3_0() { ; ; GFX90A-LABEL: s_shuffle_v2p3_v3p3__3_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -2760,6 +2804,7 @@ define void @s_shuffle_v2p3_v3p3__3_0() { ; ; GFX942-LABEL: s_shuffle_v2p3_v3p3__3_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -3041,6 +3086,7 @@ define void @s_shuffle_v2p3_v3p3__4_1() { define void @s_shuffle_v2p3_v3p3__u_2() { ; GFX900-LABEL: s_shuffle_v2p3_v3p3__u_2: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -3053,6 +3099,7 @@ define void @s_shuffle_v2p3_v3p3__u_2() { ; ; GFX90A-LABEL: s_shuffle_v2p3_v3p3__u_2: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -3065,6 +3112,7 @@ define void @s_shuffle_v2p3_v3p3__u_2() { ; ; GFX942-LABEL: s_shuffle_v2p3_v3p3__u_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -3191,6 +3239,7 @@ define void @s_shuffle_v2p3_v3p3__2_2() { define void @s_shuffle_v2p3_v3p3__3_2() { ; GFX900-LABEL: s_shuffle_v2p3_v3p3__3_2: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -3203,6 +3252,7 @@ define void @s_shuffle_v2p3_v3p3__3_2() { ; ; GFX90A-LABEL: s_shuffle_v2p3_v3p3__3_2: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -3215,6 +3265,7 @@ define void @s_shuffle_v2p3_v3p3__3_2() { ; ; GFX942-LABEL: s_shuffle_v2p3_v3p3__3_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -3342,6 +3393,7 @@ define void @s_shuffle_v2p3_v3p3__0_3() { define void @s_shuffle_v2p3_v3p3__1_3() { ; GFX900-LABEL: s_shuffle_v2p3_v3p3__1_3: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -3354,6 +3406,7 @@ define void @s_shuffle_v2p3_v3p3__1_3() { ; ; GFX90A-LABEL: s_shuffle_v2p3_v3p3__1_3: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -3366,6 +3419,7 @@ define void @s_shuffle_v2p3_v3p3__1_3() { ; ; GFX942-LABEL: s_shuffle_v2p3_v3p3__1_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -3384,6 +3438,7 @@ define void @s_shuffle_v2p3_v3p3__1_3() { define void @s_shuffle_v2p3_v3p3__2_3() { ; GFX900-LABEL: s_shuffle_v2p3_v3p3__2_3: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -3396,6 +3451,7 @@ define void @s_shuffle_v2p3_v3p3__2_3() { ; ; GFX90A-LABEL: s_shuffle_v2p3_v3p3__2_3: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -3408,6 +3464,7 @@ define void @s_shuffle_v2p3_v3p3__2_3() { ; ; GFX942-LABEL: s_shuffle_v2p3_v3p3__2_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -3743,6 +3800,7 @@ define void @s_shuffle_v2p3_v3p3__4_4() { define void @s_shuffle_v2p3_v3p3__u_5() { ; GFX900-LABEL: s_shuffle_v2p3_v3p3__u_5: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -3755,6 +3813,7 @@ define void @s_shuffle_v2p3_v3p3__u_5() { ; ; GFX90A-LABEL: s_shuffle_v2p3_v3p3__u_5: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -3767,6 +3826,7 @@ define void @s_shuffle_v2p3_v3p3__u_5() { ; ; GFX942-LABEL: s_shuffle_v2p3_v3p3__u_5: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v4p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v4p3.ll index a9085502c7358..bf3bfaf9e9521 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v4p3.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v4p3.ll @@ -72,9 +72,10 @@ define void @v_shuffle_v2p3_v4p3__1_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -84,9 +85,10 @@ define void @v_shuffle_v2p3_v4p3__1_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -141,6 +143,7 @@ define void @v_shuffle_v2p3_v4p3__3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v3 ; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] @@ -153,6 +156,7 @@ define void @v_shuffle_v2p3_v4p3__3_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] @@ -165,6 +169,7 @@ define void @v_shuffle_v2p3_v4p3__3_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] @@ -205,9 +210,10 @@ define void @v_shuffle_v2p3_v4p3__5_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -217,9 +223,10 @@ define void @v_shuffle_v2p3_v4p3__5_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -276,6 +283,7 @@ define void @v_shuffle_v2p3_v4p3__7_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v3 ; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] @@ -288,6 +296,7 @@ define void @v_shuffle_v2p3_v4p3__7_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] @@ -300,6 +309,7 @@ define void @v_shuffle_v2p3_v4p3__7_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] @@ -707,9 +717,10 @@ define void @v_shuffle_v2p3_v4p3__u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -719,9 +730,10 @@ define void @v_shuffle_v2p3_v4p3__u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -731,9 +743,10 @@ define void @v_shuffle_v2p3_v4p3__u_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -920,9 +933,10 @@ define void @v_shuffle_v2p3_v4p3__4_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -932,9 +946,10 @@ define void @v_shuffle_v2p3_v4p3__4_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -944,9 +959,10 @@ define void @v_shuffle_v2p3_v4p3__4_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1428,6 +1444,7 @@ define void @v_shuffle_v2p3_v4p3__u_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] @@ -1440,6 +1457,7 @@ define void @v_shuffle_v2p3_v4p3__u_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: v_mov_b32_e32 v1, v2 ; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] @@ -1638,6 +1656,7 @@ define void @v_shuffle_v2p3_v4p3__4_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] @@ -1650,6 +1669,7 @@ define void @v_shuffle_v2p3_v4p3__4_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: v_mov_b32_e32 v1, v2 ; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] @@ -2184,9 +2204,10 @@ define void @v_shuffle_v2p3_v4p3__1_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2196,9 +2217,10 @@ define void @v_shuffle_v2p3_v4p3__1_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2253,6 +2275,7 @@ define void @v_shuffle_v2p3_v4p3__3_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v3 ; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] @@ -2265,6 +2288,7 @@ define void @v_shuffle_v2p3_v4p3__3_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] @@ -2277,6 +2301,7 @@ define void @v_shuffle_v2p3_v4p3__3_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] @@ -2781,6 +2806,7 @@ define void @v_shuffle_v2p3_v4p3__u_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] @@ -2793,6 +2819,7 @@ define void @v_shuffle_v2p3_v4p3__u_6(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: v_mov_b32_e32 v1, v2 ; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] @@ -3582,6 +3609,7 @@ define void @s_shuffle_v2p3_v4p3__0_u() { define void @s_shuffle_v2p3_v4p3__1_u() { ; GFX900-LABEL: s_shuffle_v2p3_v4p3__1_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -3594,6 +3622,7 @@ define void @s_shuffle_v2p3_v4p3__1_u() { ; ; GFX90A-LABEL: s_shuffle_v2p3_v4p3__1_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -3606,6 +3635,7 @@ define void @s_shuffle_v2p3_v4p3__1_u() { ; ; GFX942-LABEL: s_shuffle_v2p3_v4p3__1_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -3666,6 +3696,7 @@ define void @s_shuffle_v2p3_v4p3__2_u() { define void @s_shuffle_v2p3_v4p3__3_u() { ; GFX900-LABEL: s_shuffle_v2p3_v4p3__3_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -3678,6 +3709,7 @@ define void @s_shuffle_v2p3_v4p3__3_u() { ; ; GFX90A-LABEL: s_shuffle_v2p3_v4p3__3_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -3690,6 +3722,7 @@ define void @s_shuffle_v2p3_v4p3__3_u() { ; ; GFX942-LABEL: s_shuffle_v2p3_v4p3__3_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -3722,6 +3755,7 @@ define void @s_shuffle_v2p3_v4p3__4_u() { define void @s_shuffle_v2p3_v4p3__5_u() { ; GFX900-LABEL: s_shuffle_v2p3_v4p3__5_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -3734,6 +3768,7 @@ define void @s_shuffle_v2p3_v4p3__5_u() { ; ; GFX90A-LABEL: s_shuffle_v2p3_v4p3__5_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -3746,6 +3781,7 @@ define void @s_shuffle_v2p3_v4p3__5_u() { ; ; GFX942-LABEL: s_shuffle_v2p3_v4p3__5_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -3808,6 +3844,7 @@ define void @s_shuffle_v2p3_v4p3__6_u() { define void @s_shuffle_v2p3_v4p3__7_u() { ; GFX900-LABEL: s_shuffle_v2p3_v4p3__7_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -3820,6 +3857,7 @@ define void @s_shuffle_v2p3_v4p3__7_u() { ; ; GFX90A-LABEL: s_shuffle_v2p3_v4p3__7_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -3832,6 +3870,7 @@ define void @s_shuffle_v2p3_v4p3__7_u() { ; ; GFX942-LABEL: s_shuffle_v2p3_v4p3__7_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -4225,6 +4264,7 @@ define void @s_shuffle_v2p3_v4p3__7_7() { define void @s_shuffle_v2p3_v4p3__u_0() { ; GFX900-LABEL: s_shuffle_v2p3_v4p3__u_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -4237,6 +4277,7 @@ define void @s_shuffle_v2p3_v4p3__u_0() { ; ; GFX90A-LABEL: s_shuffle_v2p3_v4p3__u_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -4249,6 +4290,7 @@ define void @s_shuffle_v2p3_v4p3__u_0() { ; ; GFX942-LABEL: s_shuffle_v2p3_v4p3__u_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -4420,6 +4462,7 @@ define void @s_shuffle_v2p3_v4p3__3_0() { define void @s_shuffle_v2p3_v4p3__4_0() { ; GFX900-LABEL: s_shuffle_v2p3_v4p3__4_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -4432,6 +4475,7 @@ define void @s_shuffle_v2p3_v4p3__4_0() { ; ; GFX90A-LABEL: s_shuffle_v2p3_v4p3__4_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -4444,6 +4488,7 @@ define void @s_shuffle_v2p3_v4p3__4_0() { ; ; GFX942-LABEL: s_shuffle_v2p3_v4p3__4_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -4880,6 +4925,7 @@ define void @s_shuffle_v2p3_v4p3__6_1() { define void @s_shuffle_v2p3_v4p3__u_2() { ; GFX900-LABEL: s_shuffle_v2p3_v4p3__u_2: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -4892,6 +4938,7 @@ define void @s_shuffle_v2p3_v4p3__u_2() { ; ; GFX90A-LABEL: s_shuffle_v2p3_v4p3__u_2: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -4904,6 +4951,7 @@ define void @s_shuffle_v2p3_v4p3__u_2() { ; ; GFX942-LABEL: s_shuffle_v2p3_v4p3__u_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -5075,6 +5123,7 @@ define void @s_shuffle_v2p3_v4p3__3_2() { define void @s_shuffle_v2p3_v4p3__4_2() { ; GFX900-LABEL: s_shuffle_v2p3_v4p3__4_2: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -5087,6 +5136,7 @@ define void @s_shuffle_v2p3_v4p3__4_2() { ; ; GFX90A-LABEL: s_shuffle_v2p3_v4p3__4_2: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -5099,6 +5149,7 @@ define void @s_shuffle_v2p3_v4p3__4_2() { ; ; GFX942-LABEL: s_shuffle_v2p3_v4p3__4_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -5625,6 +5676,7 @@ define void @s_shuffle_v2p3_v4p3__0_4() { define void @s_shuffle_v2p3_v4p3__1_4() { ; GFX900-LABEL: s_shuffle_v2p3_v4p3__1_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -5637,6 +5689,7 @@ define void @s_shuffle_v2p3_v4p3__1_4() { ; ; GFX90A-LABEL: s_shuffle_v2p3_v4p3__1_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -5649,6 +5702,7 @@ define void @s_shuffle_v2p3_v4p3__1_4() { ; ; GFX942-LABEL: s_shuffle_v2p3_v4p3__1_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -5709,6 +5763,7 @@ define void @s_shuffle_v2p3_v4p3__2_4() { define void @s_shuffle_v2p3_v4p3__3_4() { ; GFX900-LABEL: s_shuffle_v2p3_v4p3__3_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -5721,6 +5776,7 @@ define void @s_shuffle_v2p3_v4p3__3_4() { ; ; GFX90A-LABEL: s_shuffle_v2p3_v4p3__3_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -5733,6 +5789,7 @@ define void @s_shuffle_v2p3_v4p3__3_4() { ; ; GFX942-LABEL: s_shuffle_v2p3_v4p3__3_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -6215,6 +6272,7 @@ define void @s_shuffle_v2p3_v4p3__6_5() { define void @s_shuffle_v2p3_v4p3__u_6() { ; GFX900-LABEL: s_shuffle_v2p3_v4p3__u_6: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -6227,6 +6285,7 @@ define void @s_shuffle_v2p3_v4p3__u_6() { ; ; GFX90A-LABEL: s_shuffle_v2p3_v4p3__u_6: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -6239,6 +6298,7 @@ define void @s_shuffle_v2p3_v4p3__u_6() { ; ; GFX942-LABEL: s_shuffle_v2p3_v4p3__u_6: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v8p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v8p3.ll index 9174e92cd9c82..8bf6cd54b5d0f 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v8p3.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v8p3.ll @@ -72,9 +72,10 @@ define void @v_shuffle_v2p3_v8p3__1_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -84,9 +85,10 @@ define void @v_shuffle_v2p3_v8p3__1_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -152,6 +154,7 @@ define void @v_shuffle_v2p3_v8p3__3_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] @@ -164,6 +167,7 @@ define void @v_shuffle_v2p3_v8p3__3_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] @@ -232,6 +236,7 @@ define void @v_shuffle_v2p3_v8p3__5_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v5 ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] @@ -244,6 +249,7 @@ define void @v_shuffle_v2p3_v8p3__5_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v5 ; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] @@ -301,6 +307,7 @@ define void @v_shuffle_v2p3_v8p3__7_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v7 ; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] @@ -313,6 +320,7 @@ define void @v_shuffle_v2p3_v8p3__7_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v7 ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] @@ -325,6 +333,7 @@ define void @v_shuffle_v2p3_v8p3__7_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v7 ; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] @@ -365,9 +374,10 @@ define void @v_shuffle_v2p3_v8p3__9_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -377,9 +387,10 @@ define void @v_shuffle_v2p3_v8p3__9_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -447,6 +458,7 @@ define void @v_shuffle_v2p3_v8p3__11_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] @@ -459,6 +471,7 @@ define void @v_shuffle_v2p3_v8p3__11_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] @@ -529,6 +542,7 @@ define void @v_shuffle_v2p3_v8p3__13_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v5 ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] @@ -541,6 +555,7 @@ define void @v_shuffle_v2p3_v8p3__13_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v5 ; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] @@ -600,6 +615,7 @@ define void @v_shuffle_v2p3_v8p3__15_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v7 ; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] @@ -612,6 +628,7 @@ define void @v_shuffle_v2p3_v8p3__15_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v7 ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] @@ -624,6 +641,7 @@ define void @v_shuffle_v2p3_v8p3__15_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v7 ; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] @@ -1419,9 +1437,10 @@ define void @v_shuffle_v2p3_v8p3__u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1431,9 +1450,10 @@ define void @v_shuffle_v2p3_v8p3__u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1443,9 +1463,10 @@ define void @v_shuffle_v2p3_v8p3__u_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1804,9 +1825,10 @@ define void @v_shuffle_v2p3_v8p3__8_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1816,9 +1838,10 @@ define void @v_shuffle_v2p3_v8p3__8_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1828,9 +1851,10 @@ define void @v_shuffle_v2p3_v8p3__8_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2904,6 +2928,7 @@ define void @v_shuffle_v2p3_v8p3__u_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] @@ -2916,6 +2941,7 @@ define void @v_shuffle_v2p3_v8p3__u_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v1, v2 ; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] @@ -3286,6 +3312,7 @@ define void @v_shuffle_v2p3_v8p3__8_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] @@ -3298,6 +3325,7 @@ define void @v_shuffle_v2p3_v8p3__8_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v1, v2 ; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] @@ -4374,6 +4402,7 @@ define void @v_shuffle_v2p3_v8p3__u_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] @@ -4386,6 +4415,7 @@ define void @v_shuffle_v2p3_v8p3__u_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v1, v4 ; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] @@ -4756,6 +4786,7 @@ define void @v_shuffle_v2p3_v8p3__8_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] @@ -4768,6 +4799,7 @@ define void @v_shuffle_v2p3_v8p3__8_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v1, v4 ; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] @@ -5844,6 +5876,7 @@ define void @v_shuffle_v2p3_v8p3__u_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] @@ -5856,6 +5889,7 @@ define void @v_shuffle_v2p3_v8p3__u_6(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v1, v6 ; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] @@ -6226,6 +6260,7 @@ define void @v_shuffle_v2p3_v8p3__8_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] @@ -6238,6 +6273,7 @@ define void @v_shuffle_v2p3_v8p3__8_6(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v1, v6 ; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] @@ -7364,9 +7400,10 @@ define void @v_shuffle_v2p3_v8p3__1_8(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7376,9 +7413,10 @@ define void @v_shuffle_v2p3_v8p3__1_8(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -7444,6 +7482,7 @@ define void @v_shuffle_v2p3_v8p3__3_8(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] @@ -7456,6 +7495,7 @@ define void @v_shuffle_v2p3_v8p3__3_8(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] @@ -7524,6 +7564,7 @@ define void @v_shuffle_v2p3_v8p3__5_8(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v5 ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] @@ -7536,6 +7577,7 @@ define void @v_shuffle_v2p3_v8p3__5_8(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v5 ; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] @@ -7593,6 +7635,7 @@ define void @v_shuffle_v2p3_v8p3__7_8(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v7 ; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] @@ -7605,6 +7648,7 @@ define void @v_shuffle_v2p3_v8p3__7_8(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v7 ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] @@ -7617,6 +7661,7 @@ define void @v_shuffle_v2p3_v8p3__7_8(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v7 ; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] @@ -8681,6 +8726,7 @@ define void @v_shuffle_v2p3_v8p3__u_10(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] @@ -8693,6 +8739,7 @@ define void @v_shuffle_v2p3_v8p3__u_10(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v1, v2 ; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] @@ -10213,6 +10260,7 @@ define void @v_shuffle_v2p3_v8p3__u_12(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] @@ -10225,6 +10273,7 @@ define void @v_shuffle_v2p3_v8p3__u_12(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v1, v4 ; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] @@ -11745,6 +11794,7 @@ define void @v_shuffle_v2p3_v8p3__u_14(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] @@ -11757,6 +11807,7 @@ define void @v_shuffle_v2p3_v8p3__u_14(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v1, v6 ; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] @@ -13322,6 +13373,7 @@ define void @s_shuffle_v2p3_v8p3__1_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_mov_b32 s8, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] @@ -13334,6 +13386,7 @@ define void @s_shuffle_v2p3_v8p3__1_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_mov_b32 s8, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] @@ -13342,6 +13395,7 @@ define void @s_shuffle_v2p3_v8p3__1_u() { ; ; GFX942-LABEL: s_shuffle_v2p3_v8p3__1_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -13406,6 +13460,7 @@ define void @s_shuffle_v2p3_v8p3__3_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_mov_b32 s8, s7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] @@ -13418,6 +13473,7 @@ define void @s_shuffle_v2p3_v8p3__3_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_mov_b32 s8, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] @@ -13426,6 +13482,7 @@ define void @s_shuffle_v2p3_v8p3__3_u() { ; ; GFX942-LABEL: s_shuffle_v2p3_v8p3__3_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -13486,9 +13543,10 @@ define void @s_shuffle_v2p3_v8p3__5_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s9 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 +; GFX900-NEXT: s_mov_b32 s8, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -13498,9 +13556,10 @@ define void @s_shuffle_v2p3_v8p3__5_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s9 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 +; GFX90A-NEXT: s_mov_b32 s8, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -13508,6 +13567,7 @@ define void @s_shuffle_v2p3_v8p3__5_u() { ; ; GFX942-LABEL: s_shuffle_v2p3_v8p3__5_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -13572,6 +13632,7 @@ define void @s_shuffle_v2p3_v8p3__7_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_mov_b32 s8, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] @@ -13584,6 +13645,7 @@ define void @s_shuffle_v2p3_v8p3__7_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_mov_b32 s8, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] @@ -13592,6 +13654,7 @@ define void @s_shuffle_v2p3_v8p3__7_u() { ; ; GFX942-LABEL: s_shuffle_v2p3_v8p3__7_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -13628,6 +13691,7 @@ define void @s_shuffle_v2p3_v8p3__9_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_mov_b32 s8, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] @@ -13640,6 +13704,7 @@ define void @s_shuffle_v2p3_v8p3__9_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_mov_b32 s8, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] @@ -13648,6 +13713,7 @@ define void @s_shuffle_v2p3_v8p3__9_u() { ; ; GFX942-LABEL: s_shuffle_v2p3_v8p3__9_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -13714,6 +13780,7 @@ define void @s_shuffle_v2p3_v8p3__11_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_mov_b32 s8, s7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] @@ -13726,6 +13793,7 @@ define void @s_shuffle_v2p3_v8p3__11_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_mov_b32 s8, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] @@ -13734,6 +13802,7 @@ define void @s_shuffle_v2p3_v8p3__11_u() { ; ; GFX942-LABEL: s_shuffle_v2p3_v8p3__11_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -13796,9 +13865,10 @@ define void @s_shuffle_v2p3_v8p3__13_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s9 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 +; GFX900-NEXT: s_mov_b32 s8, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -13808,9 +13878,10 @@ define void @s_shuffle_v2p3_v8p3__13_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s9 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 +; GFX90A-NEXT: s_mov_b32 s8, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -13818,6 +13889,7 @@ define void @s_shuffle_v2p3_v8p3__13_u() { ; ; GFX942-LABEL: s_shuffle_v2p3_v8p3__13_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -13884,6 +13956,7 @@ define void @s_shuffle_v2p3_v8p3__15_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_mov_b32 s8, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] @@ -13896,6 +13969,7 @@ define void @s_shuffle_v2p3_v8p3__15_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_mov_b32 s8, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] @@ -13904,6 +13978,7 @@ define void @s_shuffle_v2p3_v8p3__15_u() { ; ; GFX942-LABEL: s_shuffle_v2p3_v8p3__15_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -14679,6 +14754,7 @@ define void @s_shuffle_v2p3_v8p3__u_0() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_mov_b32 s9, s4 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] @@ -14691,6 +14767,7 @@ define void @s_shuffle_v2p3_v8p3__u_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_mov_b32 s9, s4 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] @@ -14699,6 +14776,7 @@ define void @s_shuffle_v2p3_v8p3__u_0() { ; ; GFX942-LABEL: s_shuffle_v2p3_v8p3__u_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -15027,6 +15105,7 @@ define void @s_shuffle_v2p3_v8p3__8_0() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_mov_b32 s9, s4 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] @@ -15039,6 +15118,7 @@ define void @s_shuffle_v2p3_v8p3__8_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_mov_b32 s9, s4 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] @@ -15047,6 +15127,7 @@ define void @s_shuffle_v2p3_v8p3__8_0() { ; ; GFX942-LABEL: s_shuffle_v2p3_v8p3__8_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -16023,6 +16104,7 @@ define void @s_shuffle_v2p3_v8p3__u_2() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_mov_b32 s9, s6 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] @@ -16035,6 +16117,7 @@ define void @s_shuffle_v2p3_v8p3__u_2() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_mov_b32 s9, s6 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] @@ -16043,6 +16126,7 @@ define void @s_shuffle_v2p3_v8p3__u_2() { ; ; GFX942-LABEL: s_shuffle_v2p3_v8p3__u_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -16371,6 +16455,7 @@ define void @s_shuffle_v2p3_v8p3__8_2() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_mov_b32 s9, s6 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] @@ -16383,6 +16468,7 @@ define void @s_shuffle_v2p3_v8p3__8_2() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_mov_b32 s9, s6 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] @@ -16391,6 +16477,7 @@ define void @s_shuffle_v2p3_v8p3__8_2() { ; ; GFX942-LABEL: s_shuffle_v2p3_v8p3__8_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -17464,9 +17551,10 @@ define void @s_shuffle_v2p3_v8p3__u_4() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s9, s8 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 +; GFX900-NEXT: s_mov_b32 s9, s12 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -17476,9 +17564,10 @@ define void @s_shuffle_v2p3_v8p3__u_4() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s9, s8 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 +; GFX90A-NEXT: s_mov_b32 s9, s12 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -17486,6 +17575,7 @@ define void @s_shuffle_v2p3_v8p3__u_4() { ; ; GFX942-LABEL: s_shuffle_v2p3_v8p3__u_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -17812,9 +17902,10 @@ define void @s_shuffle_v2p3_v8p3__8_4() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s9, s8 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 +; GFX900-NEXT: s_mov_b32 s9, s12 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -17824,9 +17915,10 @@ define void @s_shuffle_v2p3_v8p3__8_4() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s9, s8 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 +; GFX90A-NEXT: s_mov_b32 s9, s12 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -17834,6 +17926,7 @@ define void @s_shuffle_v2p3_v8p3__8_4() { ; ; GFX942-LABEL: s_shuffle_v2p3_v8p3__8_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -18806,6 +18899,7 @@ define void @s_shuffle_v2p3_v8p3__u_6() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_mov_b32 s9, s10 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] @@ -18818,6 +18912,7 @@ define void @s_shuffle_v2p3_v8p3__u_6() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_mov_b32 s9, s10 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] @@ -18826,6 +18921,7 @@ define void @s_shuffle_v2p3_v8p3__u_6() { ; ; GFX942-LABEL: s_shuffle_v2p3_v8p3__u_6: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -19154,6 +19250,7 @@ define void @s_shuffle_v2p3_v8p3__8_6() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_mov_b32 s9, s10 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] @@ -19166,6 +19263,7 @@ define void @s_shuffle_v2p3_v8p3__8_6() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_mov_b32 s9, s10 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] @@ -19174,6 +19272,7 @@ define void @s_shuffle_v2p3_v8p3__8_6() { ; ; GFX942-LABEL: s_shuffle_v2p3_v8p3__8_6: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -20293,6 +20392,7 @@ define void @s_shuffle_v2p3_v8p3__1_8() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_mov_b32 s8, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] @@ -20305,6 +20405,7 @@ define void @s_shuffle_v2p3_v8p3__1_8() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_mov_b32 s8, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] @@ -20313,6 +20414,7 @@ define void @s_shuffle_v2p3_v8p3__1_8() { ; ; GFX942-LABEL: s_shuffle_v2p3_v8p3__1_8: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -20377,6 +20479,7 @@ define void @s_shuffle_v2p3_v8p3__3_8() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_mov_b32 s8, s7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] @@ -20389,6 +20492,7 @@ define void @s_shuffle_v2p3_v8p3__3_8() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_mov_b32 s8, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] @@ -20397,6 +20501,7 @@ define void @s_shuffle_v2p3_v8p3__3_8() { ; ; GFX942-LABEL: s_shuffle_v2p3_v8p3__3_8: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -20457,9 +20562,10 @@ define void @s_shuffle_v2p3_v8p3__5_8() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s9 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 +; GFX900-NEXT: s_mov_b32 s8, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -20469,9 +20575,10 @@ define void @s_shuffle_v2p3_v8p3__5_8() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s9 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 +; GFX90A-NEXT: s_mov_b32 s8, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -20479,6 +20586,7 @@ define void @s_shuffle_v2p3_v8p3__5_8() { ; ; GFX942-LABEL: s_shuffle_v2p3_v8p3__5_8: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -20543,6 +20651,7 @@ define void @s_shuffle_v2p3_v8p3__7_8() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_mov_b32 s8, s11 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] @@ -20555,6 +20664,7 @@ define void @s_shuffle_v2p3_v8p3__7_8() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_mov_b32 s8, s11 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] @@ -20563,6 +20673,7 @@ define void @s_shuffle_v2p3_v8p3__7_8() { ; ; GFX942-LABEL: s_shuffle_v2p3_v8p3__7_8: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -21522,6 +21633,7 @@ define void @s_shuffle_v2p3_v8p3__u_10() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_mov_b32 s9, s6 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] @@ -21534,6 +21646,7 @@ define void @s_shuffle_v2p3_v8p3__u_10() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_mov_b32 s9, s6 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] @@ -21542,6 +21655,7 @@ define void @s_shuffle_v2p3_v8p3__u_10() { ; ; GFX942-LABEL: s_shuffle_v2p3_v8p3__u_10: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -23015,9 +23129,10 @@ define void @s_shuffle_v2p3_v8p3__u_12() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s9, s8 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 +; GFX900-NEXT: s_mov_b32 s9, s12 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] ; GFX900-NEXT: ;;#ASMEND @@ -23027,9 +23142,10 @@ define void @s_shuffle_v2p3_v8p3__u_12() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s9, s8 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 +; GFX90A-NEXT: s_mov_b32 s9, s12 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] ; GFX90A-NEXT: ;;#ASMEND @@ -23037,6 +23153,7 @@ define void @s_shuffle_v2p3_v8p3__u_12() { ; ; GFX942-LABEL: s_shuffle_v2p3_v8p3__u_12: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -24442,6 +24559,7 @@ define void @s_shuffle_v2p3_v8p3__u_14() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_mov_b32 s9, s10 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:9] @@ -24454,6 +24572,7 @@ define void @s_shuffle_v2p3_v8p3__u_14() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_mov_b32 s9, s10 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:9] @@ -24462,6 +24581,7 @@ define void @s_shuffle_v2p3_v8p3__u_14() { ; ; GFX942-LABEL: s_shuffle_v2p3_v8p3__u_14: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v3bf16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v3bf16.ll index cd4dbe93e8a11..88d9517c34e1d 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v3bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v3bf16.ll @@ -4857,6 +4857,7 @@ define void @s_shuffle_v3bf16_v3bf16__1_u_u() { define void @s_shuffle_v3bf16_v3bf16__2_u_u() { ; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__2_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -4869,6 +4870,7 @@ define void @s_shuffle_v3bf16_v3bf16__2_u_u() { ; ; GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__2_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -4881,6 +4883,7 @@ define void @s_shuffle_v3bf16_v3bf16__2_u_u() { ; ; GFX942-LABEL: s_shuffle_v3bf16_v3bf16__2_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -4963,6 +4966,7 @@ define void @s_shuffle_v3bf16_v3bf16__4_u_u() { define void @s_shuffle_v3bf16_v3bf16__5_u_u() { ; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__5_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -4975,6 +4979,7 @@ define void @s_shuffle_v3bf16_v3bf16__5_u_u() { ; ; GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__5_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -4987,6 +4992,7 @@ define void @s_shuffle_v3bf16_v3bf16__5_u_u() { ; ; GFX942-LABEL: s_shuffle_v3bf16_v3bf16__5_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -7451,6 +7457,7 @@ define void @s_shuffle_v3bf16_v3bf16__1_3_3() { define void @s_shuffle_v3bf16_v3bf16__2_3_3() { ; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__2_3_3: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -7463,6 +7470,7 @@ define void @s_shuffle_v3bf16_v3bf16__2_3_3() { ; ; GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__2_3_3: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -7475,6 +7483,7 @@ define void @s_shuffle_v3bf16_v3bf16__2_3_3() { ; ; GFX942-LABEL: s_shuffle_v3bf16_v3bf16__2_3_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v4bf16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v4bf16.ll index 311ca98227da3..246f5ead02f22 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v4bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v4bf16.ll @@ -7999,6 +7999,7 @@ define void @s_shuffle_v3bf16_v4bf16__1_u_u() { define void @s_shuffle_v3bf16_v4bf16__2_u_u() { ; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__2_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -8011,6 +8012,7 @@ define void @s_shuffle_v3bf16_v4bf16__2_u_u() { ; ; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__2_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -8023,6 +8025,7 @@ define void @s_shuffle_v3bf16_v4bf16__2_u_u() { ; ; GFX942-LABEL: s_shuffle_v3bf16_v4bf16__2_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -8144,6 +8147,7 @@ define void @s_shuffle_v3bf16_v4bf16__5_u_u() { define void @s_shuffle_v3bf16_v4bf16__6_u_u() { ; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__6_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -8156,6 +8160,7 @@ define void @s_shuffle_v3bf16_v4bf16__6_u_u() { ; ; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__6_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -8168,6 +8173,7 @@ define void @s_shuffle_v3bf16_v4bf16__6_u_u() { ; ; GFX942-LABEL: s_shuffle_v3bf16_v4bf16__6_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -12442,6 +12448,7 @@ define void @s_shuffle_v3bf16_v4bf16__1_4_4() { define void @s_shuffle_v3bf16_v4bf16__2_4_4() { ; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__2_4_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -12454,6 +12461,7 @@ define void @s_shuffle_v3bf16_v4bf16__2_4_4() { ; ; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__2_4_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -12466,6 +12474,7 @@ define void @s_shuffle_v3bf16_v4bf16__2_4_4() { ; ; GFX942-LABEL: s_shuffle_v3bf16_v4bf16__2_4_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v3f16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v3f16.ll index 0854ff2ebfc5d..94453d53c9843 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v3f16.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v3f16.ll @@ -4857,6 +4857,7 @@ define void @s_shuffle_v3f16_v3f16__1_u_u() { define void @s_shuffle_v3f16_v3f16__2_u_u() { ; GFX900-LABEL: s_shuffle_v3f16_v3f16__2_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -4869,6 +4870,7 @@ define void @s_shuffle_v3f16_v3f16__2_u_u() { ; ; GFX90A-LABEL: s_shuffle_v3f16_v3f16__2_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -4881,6 +4883,7 @@ define void @s_shuffle_v3f16_v3f16__2_u_u() { ; ; GFX942-LABEL: s_shuffle_v3f16_v3f16__2_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -4963,6 +4966,7 @@ define void @s_shuffle_v3f16_v3f16__4_u_u() { define void @s_shuffle_v3f16_v3f16__5_u_u() { ; GFX900-LABEL: s_shuffle_v3f16_v3f16__5_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -4975,6 +4979,7 @@ define void @s_shuffle_v3f16_v3f16__5_u_u() { ; ; GFX90A-LABEL: s_shuffle_v3f16_v3f16__5_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -4987,6 +4992,7 @@ define void @s_shuffle_v3f16_v3f16__5_u_u() { ; ; GFX942-LABEL: s_shuffle_v3f16_v3f16__5_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -7451,6 +7457,7 @@ define void @s_shuffle_v3f16_v3f16__1_3_3() { define void @s_shuffle_v3f16_v3f16__2_3_3() { ; GFX900-LABEL: s_shuffle_v3f16_v3f16__2_3_3: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -7463,6 +7470,7 @@ define void @s_shuffle_v3f16_v3f16__2_3_3() { ; ; GFX90A-LABEL: s_shuffle_v3f16_v3f16__2_3_3: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -7475,6 +7483,7 @@ define void @s_shuffle_v3f16_v3f16__2_3_3() { ; ; GFX942-LABEL: s_shuffle_v3f16_v3f16__2_3_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v4f16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v4f16.ll index ecc7ff618932b..3907643c47561 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v4f16.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v4f16.ll @@ -7999,6 +7999,7 @@ define void @s_shuffle_v3f16_v4f16__1_u_u() { define void @s_shuffle_v3f16_v4f16__2_u_u() { ; GFX900-LABEL: s_shuffle_v3f16_v4f16__2_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -8011,6 +8012,7 @@ define void @s_shuffle_v3f16_v4f16__2_u_u() { ; ; GFX90A-LABEL: s_shuffle_v3f16_v4f16__2_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -8023,6 +8025,7 @@ define void @s_shuffle_v3f16_v4f16__2_u_u() { ; ; GFX942-LABEL: s_shuffle_v3f16_v4f16__2_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -8144,6 +8147,7 @@ define void @s_shuffle_v3f16_v4f16__5_u_u() { define void @s_shuffle_v3f16_v4f16__6_u_u() { ; GFX900-LABEL: s_shuffle_v3f16_v4f16__6_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -8156,6 +8160,7 @@ define void @s_shuffle_v3f16_v4f16__6_u_u() { ; ; GFX90A-LABEL: s_shuffle_v3f16_v4f16__6_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -8168,6 +8173,7 @@ define void @s_shuffle_v3f16_v4f16__6_u_u() { ; ; GFX942-LABEL: s_shuffle_v3f16_v4f16__6_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -12442,6 +12448,7 @@ define void @s_shuffle_v3f16_v4f16__1_4_4() { define void @s_shuffle_v3f16_v4f16__2_4_4() { ; GFX900-LABEL: s_shuffle_v3f16_v4f16__2_4_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -12454,6 +12461,7 @@ define void @s_shuffle_v3f16_v4f16__2_4_4() { ; ; GFX90A-LABEL: s_shuffle_v3f16_v4f16__2_4_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -12466,6 +12474,7 @@ define void @s_shuffle_v3f16_v4f16__2_4_4() { ; ; GFX942-LABEL: s_shuffle_v3f16_v4f16__2_4_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v2f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v2f32.ll index 430f64164d24f..264963bf3443c 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v2f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v2f32.ll @@ -59,11 +59,12 @@ define void @v_shuffle_v3f32_v2f32__1_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -71,11 +72,12 @@ define void @v_shuffle_v3f32_v2f32__1_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -83,11 +85,12 @@ define void @v_shuffle_v3f32_v2f32__1_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx3 v2, v[0:2], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -112,11 +115,12 @@ define void @v_shuffle_v3f32_v2f32__3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -124,11 +128,12 @@ define void @v_shuffle_v3f32_v2f32__3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -136,11 +141,12 @@ define void @v_shuffle_v3f32_v2f32__3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx3 v2, v[0:2], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -170,15 +176,16 @@ define void @v_shuffle_v3f32_v2f32__3_0_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -186,15 +193,16 @@ define void @v_shuffle_v3f32_v2f32__3_0_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[4:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -272,28 +280,30 @@ define void @v_shuffle_v3f32_v2f32__3_2_u(ptr addrspace(1) inreg %ptr) { ; ; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_2_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v2f32__3_2_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[4:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -560,26 +570,29 @@ define void @v_shuffle_v3f32_v2f32__u_0_0(ptr addrspace(1) inreg %ptr) { ; ; GFX90A-LABEL: v_shuffle_v3f32_v2f32__u_0_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v2f32__u_0_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[4:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -695,26 +708,29 @@ define void @v_shuffle_v3f32_v2f32__2_0_0(ptr addrspace(1) inreg %ptr) { ; ; GFX90A-LABEL: v_shuffle_v3f32_v2f32__2_0_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v2f32__2_0_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[4:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -784,15 +800,16 @@ define void @v_shuffle_v3f32_v2f32__3_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[4:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -800,14 +817,16 @@ define void @v_shuffle_v3f32_v2f32__3_u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -815,14 +834,16 @@ define void @v_shuffle_v3f32_v2f32__3_u_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[4:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -1192,15 +1213,16 @@ define void @v_shuffle_v3f32_v2f32__3_u_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1208,15 +1230,16 @@ define void @v_shuffle_v3f32_v2f32__3_u_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[4:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -1394,11 +1417,12 @@ define void @v_shuffle_v3f32_v2f32__1_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1406,11 +1430,12 @@ define void @v_shuffle_v3f32_v2f32__1_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1418,11 +1443,12 @@ define void @v_shuffle_v3f32_v2f32__1_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx3 v2, v[0:2], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -1492,39 +1518,44 @@ define void @v_shuffle_v3f32_v2f32__3_2_2(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v3f32_v2f32__3_u_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_u_2: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_u_2: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: global_store_dwordx3 v1, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v2f32__3_u_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v1, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[4:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: global_store_dwordx3 v1, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -1857,12 +1888,13 @@ define void @v_shuffle_v3f32_v2f32__3_u_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1870,12 +1902,13 @@ define void @v_shuffle_v3f32_v2f32__3_u_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -2101,6 +2134,7 @@ define void @s_shuffle_v3f32_v2f32__0_u_u() { define void @s_shuffle_v3f32_v2f32__1_u_u() { ; GFX900-LABEL: s_shuffle_v3f32_v2f32__1_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -2113,6 +2147,7 @@ define void @s_shuffle_v3f32_v2f32__1_u_u() { ; ; GFX90A-LABEL: s_shuffle_v3f32_v2f32__1_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -2125,6 +2160,7 @@ define void @s_shuffle_v3f32_v2f32__1_u_u() { ; ; GFX942-LABEL: s_shuffle_v3f32_v2f32__1_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -2157,6 +2193,7 @@ define void @s_shuffle_v3f32_v2f32__2_u_u() { define void @s_shuffle_v3f32_v2f32__3_u_u() { ; GFX900-LABEL: s_shuffle_v3f32_v2f32__3_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -2169,6 +2206,7 @@ define void @s_shuffle_v3f32_v2f32__3_u_u() { ; ; GFX90A-LABEL: s_shuffle_v3f32_v2f32__3_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -2181,6 +2219,7 @@ define void @s_shuffle_v3f32_v2f32__3_u_u() { ; ; GFX942-LABEL: s_shuffle_v3f32_v2f32__3_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -2200,6 +2239,7 @@ define void @s_shuffle_v3f32_v2f32__3_u_u() { define void @s_shuffle_v3f32_v2f32__3_0_u() { ; GFX900-LABEL: s_shuffle_v3f32_v2f32__3_0_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -2216,6 +2256,7 @@ define void @s_shuffle_v3f32_v2f32__3_0_u() { ; ; GFX90A-LABEL: s_shuffle_v3f32_v2f32__3_0_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -2232,6 +2273,7 @@ define void @s_shuffle_v3f32_v2f32__3_0_u() { ; ; GFX942-LABEL: s_shuffle_v3f32_v2f32__3_0_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -2307,6 +2349,7 @@ define void @s_shuffle_v3f32_v2f32__3_1_u() { define void @s_shuffle_v3f32_v2f32__3_2_u() { ; GFX900-LABEL: s_shuffle_v3f32_v2f32__3_2_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -2320,6 +2363,7 @@ define void @s_shuffle_v3f32_v2f32__3_2_u() { ; ; GFX90A-LABEL: s_shuffle_v3f32_v2f32__3_2_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -2333,6 +2377,7 @@ define void @s_shuffle_v3f32_v2f32__3_2_u() { ; ; GFX942-LABEL: s_shuffle_v3f32_v2f32__3_2_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -2551,6 +2596,7 @@ define void @s_shuffle_v3f32_v2f32__3_3_3() { define void @s_shuffle_v3f32_v2f32__u_0_0() { ; GFX900-LABEL: s_shuffle_v3f32_v2f32__u_0_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -2564,6 +2610,7 @@ define void @s_shuffle_v3f32_v2f32__u_0_0() { ; ; GFX90A-LABEL: s_shuffle_v3f32_v2f32__u_0_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -2577,6 +2624,7 @@ define void @s_shuffle_v3f32_v2f32__u_0_0() { ; ; GFX942-LABEL: s_shuffle_v3f32_v2f32__u_0_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -2663,6 +2711,7 @@ define void @s_shuffle_v3f32_v2f32__1_0_0() { define void @s_shuffle_v3f32_v2f32__2_0_0() { ; GFX900-LABEL: s_shuffle_v3f32_v2f32__2_0_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -2676,6 +2725,7 @@ define void @s_shuffle_v3f32_v2f32__2_0_0() { ; ; GFX90A-LABEL: s_shuffle_v3f32_v2f32__2_0_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -2689,6 +2739,7 @@ define void @s_shuffle_v3f32_v2f32__2_0_0() { ; ; GFX942-LABEL: s_shuffle_v3f32_v2f32__2_0_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -2766,6 +2817,7 @@ define void @s_shuffle_v3f32_v2f32__3_0_0() { define void @s_shuffle_v3f32_v2f32__3_u_0() { ; GFX900-LABEL: s_shuffle_v3f32_v2f32__3_u_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -2782,6 +2834,7 @@ define void @s_shuffle_v3f32_v2f32__3_u_0() { ; ; GFX90A-LABEL: s_shuffle_v3f32_v2f32__3_u_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -2798,6 +2851,7 @@ define void @s_shuffle_v3f32_v2f32__3_u_0() { ; ; GFX942-LABEL: s_shuffle_v3f32_v2f32__3_u_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -3065,6 +3119,7 @@ define void @s_shuffle_v3f32_v2f32__3_1_1() { define void @s_shuffle_v3f32_v2f32__3_u_1() { ; GFX900-LABEL: s_shuffle_v3f32_v2f32__3_u_1: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -3081,6 +3136,7 @@ define void @s_shuffle_v3f32_v2f32__3_u_1() { ; ; GFX90A-LABEL: s_shuffle_v3f32_v2f32__3_u_1: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -3097,6 +3153,7 @@ define void @s_shuffle_v3f32_v2f32__3_u_1() { ; ; GFX942-LABEL: s_shuffle_v3f32_v2f32__3_u_1: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -3290,6 +3347,7 @@ define void @s_shuffle_v3f32_v2f32__0_2_2() { define void @s_shuffle_v3f32_v2f32__1_2_2() { ; GFX900-LABEL: s_shuffle_v3f32_v2f32__1_2_2: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -3302,6 +3360,7 @@ define void @s_shuffle_v3f32_v2f32__1_2_2() { ; ; GFX90A-LABEL: s_shuffle_v3f32_v2f32__1_2_2: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -3314,6 +3373,7 @@ define void @s_shuffle_v3f32_v2f32__1_2_2() { ; ; GFX942-LABEL: s_shuffle_v3f32_v2f32__1_2_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -3395,6 +3455,7 @@ define void @s_shuffle_v3f32_v2f32__3_2_2() { define void @s_shuffle_v3f32_v2f32__3_u_2() { ; GFX900-LABEL: s_shuffle_v3f32_v2f32__3_u_2: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -3408,6 +3469,7 @@ define void @s_shuffle_v3f32_v2f32__3_u_2() { ; ; GFX90A-LABEL: s_shuffle_v3f32_v2f32__3_u_2: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -3421,6 +3483,7 @@ define void @s_shuffle_v3f32_v2f32__3_u_2() { ; ; GFX942-LABEL: s_shuffle_v3f32_v2f32__3_u_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -3702,6 +3765,7 @@ define void @s_shuffle_v3f32_v2f32__2_3_3() { define void @s_shuffle_v3f32_v2f32__3_u_3() { ; GFX900-LABEL: s_shuffle_v3f32_v2f32__3_u_3: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -3715,6 +3779,7 @@ define void @s_shuffle_v3f32_v2f32__3_u_3() { ; ; GFX90A-LABEL: s_shuffle_v3f32_v2f32__3_u_3: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -3728,6 +3793,7 @@ define void @s_shuffle_v3f32_v2f32__3_u_3() { ; ; GFX942-LABEL: s_shuffle_v3f32_v2f32__3_u_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v3f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v3f32.ll index ef670e963bdb6..7cc913cf5fcf6 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v3f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v3f32.ll @@ -61,9 +61,10 @@ define void @v_shuffle_v3f32_v3f32__1_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -73,9 +74,10 @@ define void @v_shuffle_v3f32_v3f32__1_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -85,9 +87,10 @@ define void @v_shuffle_v3f32_v3f32__1_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -99,37 +102,41 @@ define void @v_shuffle_v3f32_v3f32__1_u_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v3f32_v3f32__2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__2_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f32_v3f32__2_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v3f32__2_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -156,9 +163,10 @@ define void @v_shuffle_v3f32_v3f32__4_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -168,9 +176,10 @@ define void @v_shuffle_v3f32_v3f32__4_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -180,9 +189,10 @@ define void @v_shuffle_v3f32_v3f32__4_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -195,37 +205,41 @@ define void @v_shuffle_v3f32_v3f32__4_u_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v3f32_v3f32__5_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v3f32__5_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -242,13 +256,14 @@ define void @v_shuffle_v3f32_v3f32__5_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -258,13 +273,14 @@ define void @v_shuffle_v3f32_v3f32__5_0_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -274,14 +290,14 @@ define void @v_shuffle_v3f32_v3f32__5_0_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8 ; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -347,49 +363,53 @@ define void @v_shuffle_v3f32_v3f32__5_1_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v3f32_v3f32__5_2_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_2_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: v_mov_b32_e32 v7, v2 +; GFX900-NEXT: global_store_dwordx3 v9, v[6:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_2_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr8_vgpr9_vgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[8:10], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v3f32__5_2_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr8_vgpr9_vgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v2 +; GFX942-NEXT: global_store_dwordx3 v3, v[8:10], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -402,39 +422,44 @@ define void @v_shuffle_v3f32_v3f32__5_2_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v3f32_v3f32__5_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_3_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_3_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v3f32__5_3_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -490,40 +515,44 @@ define void @v_shuffle_v3f32_v3f32__5_4_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v3f32_v3f32__5_5_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_5_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_5_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v3f32__5_5_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -859,10 +888,11 @@ define void @v_shuffle_v3f32_v3f32__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -872,10 +902,11 @@ define void @v_shuffle_v3f32_v3f32__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -885,10 +916,11 @@ define void @v_shuffle_v3f32_v3f32__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1043,10 +1075,11 @@ define void @v_shuffle_v3f32_v3f32__3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1056,10 +1089,11 @@ define void @v_shuffle_v3f32_v3f32__3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1069,10 +1103,11 @@ define void @v_shuffle_v3f32_v3f32__3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1206,13 +1241,14 @@ define void @v_shuffle_v3f32_v3f32__5_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1222,13 +1258,14 @@ define void @v_shuffle_v3f32_v3f32__5_u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1238,14 +1275,14 @@ define void @v_shuffle_v3f32_v3f32__5_u_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8 ; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1818,13 +1855,14 @@ define void @v_shuffle_v3f32_v3f32__5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr5_vgpr6_vgpr7 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx3 v8, v[5:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1834,13 +1872,14 @@ define void @v_shuffle_v3f32_v3f32__5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1850,14 +1889,14 @@ define void @v_shuffle_v3f32_v3f32__5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8 ; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: global_store_dwordx3 v5, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -2754,9 +2793,10 @@ define void @v_shuffle_v3f32_v3f32__1_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2766,9 +2806,10 @@ define void @v_shuffle_v3f32_v3f32__1_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2778,9 +2819,10 @@ define void @v_shuffle_v3f32_v3f32__1_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -2792,37 +2834,41 @@ define void @v_shuffle_v3f32_v3f32__1_3_3(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v3f32_v3f32__2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__2_3_3: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f32_v3f32__2_3_3: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v3f32__2_3_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -2941,39 +2987,44 @@ define void @v_shuffle_v3f32_v3f32__5_3_3(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v3f32_v3f32__5_u_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_u_3: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_u_3: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v3f32__5_u_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -3544,40 +3595,44 @@ define void @v_shuffle_v3f32_v3f32__5_4_4(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v3f32_v3f32__5_u_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_u_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_u_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v3f32__5_u_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -4462,6 +4517,7 @@ define void @s_shuffle_v3f32_v3f32__0_u_u() { define void @s_shuffle_v3f32_v3f32__1_u_u() { ; GFX900-LABEL: s_shuffle_v3f32_v3f32__1_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -4474,6 +4530,7 @@ define void @s_shuffle_v3f32_v3f32__1_u_u() { ; ; GFX90A-LABEL: s_shuffle_v3f32_v3f32__1_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -4486,6 +4543,7 @@ define void @s_shuffle_v3f32_v3f32__1_u_u() { ; ; GFX942-LABEL: s_shuffle_v3f32_v3f32__1_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -4504,6 +4562,7 @@ define void @s_shuffle_v3f32_v3f32__1_u_u() { define void @s_shuffle_v3f32_v3f32__2_u_u() { ; GFX900-LABEL: s_shuffle_v3f32_v3f32__2_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -4516,6 +4575,7 @@ define void @s_shuffle_v3f32_v3f32__2_u_u() { ; ; GFX90A-LABEL: s_shuffle_v3f32_v3f32__2_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -4528,6 +4588,7 @@ define void @s_shuffle_v3f32_v3f32__2_u_u() { ; ; GFX942-LABEL: s_shuffle_v3f32_v3f32__2_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -4560,6 +4621,7 @@ define void @s_shuffle_v3f32_v3f32__3_u_u() { define void @s_shuffle_v3f32_v3f32__4_u_u() { ; GFX900-LABEL: s_shuffle_v3f32_v3f32__4_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -4572,6 +4634,7 @@ define void @s_shuffle_v3f32_v3f32__4_u_u() { ; ; GFX90A-LABEL: s_shuffle_v3f32_v3f32__4_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -4584,6 +4647,7 @@ define void @s_shuffle_v3f32_v3f32__4_u_u() { ; ; GFX942-LABEL: s_shuffle_v3f32_v3f32__4_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -4603,6 +4667,7 @@ define void @s_shuffle_v3f32_v3f32__4_u_u() { define void @s_shuffle_v3f32_v3f32__5_u_u() { ; GFX900-LABEL: s_shuffle_v3f32_v3f32__5_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -4615,6 +4680,7 @@ define void @s_shuffle_v3f32_v3f32__5_u_u() { ; ; GFX90A-LABEL: s_shuffle_v3f32_v3f32__5_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -4627,6 +4693,7 @@ define void @s_shuffle_v3f32_v3f32__5_u_u() { ; ; GFX942-LABEL: s_shuffle_v3f32_v3f32__5_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -4646,14 +4713,15 @@ define void @s_shuffle_v3f32_v3f32__5_u_u() { define void @s_shuffle_v3f32_v3f32__5_0_u() { ; GFX900-LABEL: s_shuffle_v3f32_v3f32__5_0_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ; def s[12:14] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s8, s14 ; GFX900-NEXT: s_mov_b32 s9, s4 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:10] @@ -4662,14 +4730,15 @@ define void @s_shuffle_v3f32_v3f32__5_0_u() { ; ; GFX90A-LABEL: s_shuffle_v3f32_v3f32__5_0_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ; def s[12:14] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s8, s14 ; GFX90A-NEXT: s_mov_b32 s9, s4 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:10] @@ -4678,6 +4747,7 @@ define void @s_shuffle_v3f32_v3f32__5_0_u() { ; ; GFX942-LABEL: s_shuffle_v3f32_v3f32__5_0_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -4753,14 +4823,15 @@ define void @s_shuffle_v3f32_v3f32__5_1_u() { define void @s_shuffle_v3f32_v3f32__5_2_u() { ; GFX900-LABEL: s_shuffle_v3f32_v3f32__5_2_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ; def s[12:14] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s8, s14 ; GFX900-NEXT: s_mov_b32 s9, s6 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:10] @@ -4769,14 +4840,15 @@ define void @s_shuffle_v3f32_v3f32__5_2_u() { ; ; GFX90A-LABEL: s_shuffle_v3f32_v3f32__5_2_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ; def s[12:14] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s8, s14 ; GFX90A-NEXT: s_mov_b32 s9, s6 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:10] @@ -4785,6 +4857,7 @@ define void @s_shuffle_v3f32_v3f32__5_2_u() { ; ; GFX942-LABEL: s_shuffle_v3f32_v3f32__5_2_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -4808,6 +4881,7 @@ define void @s_shuffle_v3f32_v3f32__5_2_u() { define void @s_shuffle_v3f32_v3f32__5_3_u() { ; GFX900-LABEL: s_shuffle_v3f32_v3f32__5_3_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -4821,6 +4895,7 @@ define void @s_shuffle_v3f32_v3f32__5_3_u() { ; ; GFX90A-LABEL: s_shuffle_v3f32_v3f32__5_3_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -4834,6 +4909,7 @@ define void @s_shuffle_v3f32_v3f32__5_3_u() { ; ; GFX942-LABEL: s_shuffle_v3f32_v3f32__5_3_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -4873,6 +4949,7 @@ define void @s_shuffle_v3f32_v3f32__5_4_u() { define void @s_shuffle_v3f32_v3f32__5_5_u() { ; GFX900-LABEL: s_shuffle_v3f32_v3f32__5_5_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -4886,6 +4963,7 @@ define void @s_shuffle_v3f32_v3f32__5_5_u() { ; ; GFX90A-LABEL: s_shuffle_v3f32_v3f32__5_5_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -4899,6 +4977,7 @@ define void @s_shuffle_v3f32_v3f32__5_5_u() { ; ; GFX942-LABEL: s_shuffle_v3f32_v3f32__5_5_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -5208,6 +5287,7 @@ define void @s_shuffle_v3f32_v3f32__5_5_5() { define void @s_shuffle_v3f32_v3f32__u_0_0() { ; GFX900-LABEL: s_shuffle_v3f32_v3f32__u_0_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -5221,6 +5301,7 @@ define void @s_shuffle_v3f32_v3f32__u_0_0() { ; ; GFX90A-LABEL: s_shuffle_v3f32_v3f32__u_0_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -5234,6 +5315,7 @@ define void @s_shuffle_v3f32_v3f32__u_0_0() { ; ; GFX942-LABEL: s_shuffle_v3f32_v3f32__u_0_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -5368,6 +5450,7 @@ define void @s_shuffle_v3f32_v3f32__2_0_0() { define void @s_shuffle_v3f32_v3f32__3_0_0() { ; GFX900-LABEL: s_shuffle_v3f32_v3f32__3_0_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -5381,6 +5464,7 @@ define void @s_shuffle_v3f32_v3f32__3_0_0() { ; ; GFX90A-LABEL: s_shuffle_v3f32_v3f32__3_0_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -5394,6 +5478,7 @@ define void @s_shuffle_v3f32_v3f32__3_0_0() { ; ; GFX942-LABEL: s_shuffle_v3f32_v3f32__3_0_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -5529,14 +5614,15 @@ define void @s_shuffle_v3f32_v3f32__5_0_0() { define void @s_shuffle_v3f32_v3f32__5_u_0() { ; GFX900-LABEL: s_shuffle_v3f32_v3f32__5_u_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ; def s[12:14] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s8, s14 ; GFX900-NEXT: s_mov_b32 s10, s4 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:10] @@ -5545,14 +5631,15 @@ define void @s_shuffle_v3f32_v3f32__5_u_0() { ; ; GFX90A-LABEL: s_shuffle_v3f32_v3f32__5_u_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ; def s[12:14] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s8, s14 ; GFX90A-NEXT: s_mov_b32 s10, s4 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:10] @@ -5561,6 +5648,7 @@ define void @s_shuffle_v3f32_v3f32__5_u_0() { ; ; GFX942-LABEL: s_shuffle_v3f32_v3f32__5_u_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -6015,14 +6103,15 @@ define void @s_shuffle_v3f32_v3f32__5_1_1() { define void @s_shuffle_v3f32_v3f32__5_u_1() { ; GFX900-LABEL: s_shuffle_v3f32_v3f32__5_u_1: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ; def s[12:14] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s8, s14 ; GFX900-NEXT: s_mov_b32 s10, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:10] @@ -6031,14 +6120,15 @@ define void @s_shuffle_v3f32_v3f32__5_u_1() { ; ; GFX90A-LABEL: s_shuffle_v3f32_v3f32__5_u_1: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ; def s[12:14] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s8, s14 ; GFX90A-NEXT: s_mov_b32 s10, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:10] @@ -6047,6 +6137,7 @@ define void @s_shuffle_v3f32_v3f32__5_u_1() { ; ; GFX942-LABEL: s_shuffle_v3f32_v3f32__5_u_1: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -6827,6 +6918,7 @@ define void @s_shuffle_v3f32_v3f32__0_3_3() { define void @s_shuffle_v3f32_v3f32__1_3_3() { ; GFX900-LABEL: s_shuffle_v3f32_v3f32__1_3_3: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -6839,6 +6931,7 @@ define void @s_shuffle_v3f32_v3f32__1_3_3() { ; ; GFX90A-LABEL: s_shuffle_v3f32_v3f32__1_3_3: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -6851,6 +6944,7 @@ define void @s_shuffle_v3f32_v3f32__1_3_3() { ; ; GFX942-LABEL: s_shuffle_v3f32_v3f32__1_3_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -6869,6 +6963,7 @@ define void @s_shuffle_v3f32_v3f32__1_3_3() { define void @s_shuffle_v3f32_v3f32__2_3_3() { ; GFX900-LABEL: s_shuffle_v3f32_v3f32__2_3_3: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -6881,6 +6976,7 @@ define void @s_shuffle_v3f32_v3f32__2_3_3() { ; ; GFX90A-LABEL: s_shuffle_v3f32_v3f32__2_3_3: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -6893,6 +6989,7 @@ define void @s_shuffle_v3f32_v3f32__2_3_3() { ; ; GFX942-LABEL: s_shuffle_v3f32_v3f32__2_3_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -7023,6 +7120,7 @@ define void @s_shuffle_v3f32_v3f32__5_3_3() { define void @s_shuffle_v3f32_v3f32__5_u_3() { ; GFX900-LABEL: s_shuffle_v3f32_v3f32__5_u_3: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -7036,6 +7134,7 @@ define void @s_shuffle_v3f32_v3f32__5_u_3() { ; ; GFX90A-LABEL: s_shuffle_v3f32_v3f32__5_u_3: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -7049,6 +7148,7 @@ define void @s_shuffle_v3f32_v3f32__5_u_3() { ; ; GFX942-LABEL: s_shuffle_v3f32_v3f32__5_u_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -7532,6 +7632,7 @@ define void @s_shuffle_v3f32_v3f32__5_4_4() { define void @s_shuffle_v3f32_v3f32__5_u_4() { ; GFX900-LABEL: s_shuffle_v3f32_v3f32__5_u_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -7545,6 +7646,7 @@ define void @s_shuffle_v3f32_v3f32__5_u_4() { ; ; GFX90A-LABEL: s_shuffle_v3f32_v3f32__5_u_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -7558,6 +7660,7 @@ define void @s_shuffle_v3f32_v3f32__5_u_4() { ; ; GFX942-LABEL: s_shuffle_v3f32_v3f32__5_u_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v4f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v4f32.ll index 50c69de069986..8f3644076079b 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v4f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v4f32.ll @@ -72,9 +72,10 @@ define void @v_shuffle_v3f32_v4f32__1_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -84,9 +85,10 @@ define void @v_shuffle_v3f32_v4f32__1_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -102,33 +104,37 @@ define void @v_shuffle_v3f32_v4f32__2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__2_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v4f32__2_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -144,6 +150,7 @@ define void @v_shuffle_v3f32_v4f32__3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v3 ; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] @@ -156,6 +163,7 @@ define void @v_shuffle_v3f32_v4f32__3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] @@ -168,6 +176,7 @@ define void @v_shuffle_v3f32_v4f32__3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] @@ -208,9 +217,10 @@ define void @v_shuffle_v3f32_v4f32__5_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -220,9 +230,10 @@ define void @v_shuffle_v3f32_v4f32__5_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -239,33 +250,37 @@ define void @v_shuffle_v3f32_v4f32__6_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__6_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v4f32__6_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -282,6 +297,7 @@ define void @v_shuffle_v3f32_v4f32__7_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v3 ; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] @@ -294,6 +310,7 @@ define void @v_shuffle_v3f32_v4f32__7_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] @@ -306,6 +323,7 @@ define void @v_shuffle_v3f32_v4f32__7_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] @@ -328,6 +346,7 @@ define void @v_shuffle_v3f32_v4f32__7_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: v_mov_b32_e32 v1, v4 ; GFX900-NEXT: v_mov_b32_e32 v2, v0 @@ -344,6 +363,7 @@ define void @v_shuffle_v3f32_v4f32__7_0_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v5 ; GFX90A-NEXT: v_mov_b32_e32 v3, v0 @@ -361,6 +381,7 @@ define void @v_shuffle_v3f32_v4f32__7_0_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mov_b32_e32 v2, v5 ; GFX942-NEXT: v_mov_b32_e32 v3, v0 @@ -447,15 +468,16 @@ define void @v_shuffle_v3f32_v4f32__7_2_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx3 v8, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -463,15 +485,16 @@ define void @v_shuffle_v3f32_v4f32__7_2_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v7 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -488,6 +511,7 @@ define void @v_shuffle_v3f32_v4f32__7_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:7] @@ -504,6 +528,7 @@ define void @v_shuffle_v3f32_v4f32__7_3_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] @@ -520,6 +545,7 @@ define void @v_shuffle_v3f32_v4f32__7_3_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] @@ -539,40 +565,44 @@ define void @v_shuffle_v3f32_v4f32__7_3_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v3f32_v4f32__7_4_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_4_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_4_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v4f32__7_4_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -640,27 +670,30 @@ define void @v_shuffle_v3f32_v4f32__7_6_u(ptr addrspace(1) inreg %ptr) { ; ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_6_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v4f32__7_6_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -677,6 +710,7 @@ define void @v_shuffle_v3f32_v4f32__7_7_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v3 ; GFX900-NEXT: v_mov_b32_e32 v1, v3 @@ -690,6 +724,7 @@ define void @v_shuffle_v3f32_v4f32__7_7_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3 @@ -703,6 +738,7 @@ define void @v_shuffle_v3f32_v4f32__7_7_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: v_mov_b32_e32 v1, v3 @@ -1141,10 +1177,11 @@ define void @v_shuffle_v3f32_v4f32__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 ; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1154,10 +1191,11 @@ define void @v_shuffle_v3f32_v4f32__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1167,10 +1205,11 @@ define void @v_shuffle_v3f32_v4f32__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -1372,10 +1411,11 @@ define void @v_shuffle_v3f32_v4f32__4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 ; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1385,10 +1425,11 @@ define void @v_shuffle_v3f32_v4f32__4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1398,10 +1439,11 @@ define void @v_shuffle_v3f32_v4f32__4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -1596,6 +1638,7 @@ define void @v_shuffle_v3f32_v4f32__7_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: v_mov_b32_e32 v1, v4 ; GFX900-NEXT: v_mov_b32_e32 v3, v0 @@ -1612,6 +1655,7 @@ define void @v_shuffle_v3f32_v4f32__7_u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v5 ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 @@ -1629,6 +1673,7 @@ define void @v_shuffle_v3f32_v4f32__7_u_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mov_b32_e32 v2, v5 ; GFX942-NEXT: v_mov_b32_e32 v4, v0 @@ -2425,10 +2470,11 @@ define void @v_shuffle_v3f32_v4f32__7_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 ; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2441,10 +2487,11 @@ define void @v_shuffle_v3f32_v4f32__7_u_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2458,10 +2505,11 @@ define void @v_shuffle_v3f32_v4f32__7_u_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v5 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v5 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -3645,6 +3693,7 @@ define void @v_shuffle_v3f32_v4f32__u_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3 ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 @@ -3658,6 +3707,7 @@ define void @v_shuffle_v3f32_v4f32__u_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: v_mov_b32_e32 v1, v3 ; GFX942-NEXT: v_mov_b32_e32 v2, v3 @@ -3874,6 +3924,7 @@ define void @v_shuffle_v3f32_v4f32__4_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3 ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 @@ -3887,6 +3938,7 @@ define void @v_shuffle_v3f32_v4f32__4_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: v_mov_b32_e32 v1, v3 ; GFX942-NEXT: v_mov_b32_e32 v2, v3 @@ -4092,6 +4144,7 @@ define void @v_shuffle_v3f32_v4f32__7_u_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] @@ -4108,6 +4161,7 @@ define void @v_shuffle_v3f32_v4f32__7_u_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] @@ -4529,9 +4583,10 @@ define void @v_shuffle_v3f32_v4f32__1_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4541,9 +4596,10 @@ define void @v_shuffle_v3f32_v4f32__1_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -4559,33 +4615,37 @@ define void @v_shuffle_v3f32_v4f32__2_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__2_4_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v4f32__2_4_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -4601,6 +4661,7 @@ define void @v_shuffle_v3f32_v4f32__3_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v3 ; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] @@ -4613,6 +4674,7 @@ define void @v_shuffle_v3f32_v4f32__3_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] @@ -4625,6 +4687,7 @@ define void @v_shuffle_v3f32_v4f32__3_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] @@ -4794,40 +4857,44 @@ define void @v_shuffle_v3f32_v4f32__7_4_4(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v3f32_v4f32__7_u_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_u_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_u_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v4f32__7_u_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -5608,40 +5675,44 @@ define void @v_shuffle_v3f32_v4f32__7_5_5(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v3f32_v4f32__7_u_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_u_5: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_u_5: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3f32_v4f32__7_u_5: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -6800,6 +6871,7 @@ define void @v_shuffle_v3f32_v4f32__u_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3 ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 @@ -6813,6 +6885,7 @@ define void @v_shuffle_v3f32_v4f32__u_7_7(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: v_mov_b32_e32 v1, v3 ; GFX942-NEXT: v_mov_b32_e32 v2, v3 @@ -7213,6 +7286,7 @@ define void @v_shuffle_v3f32_v4f32__7_u_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 @@ -7226,6 +7300,7 @@ define void @v_shuffle_v3f32_v4f32__7_u_7(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: v_mov_b32_e32 v2, v3 @@ -7666,6 +7741,7 @@ define void @s_shuffle_v3f32_v4f32__0_u_u() { define void @s_shuffle_v3f32_v4f32__1_u_u() { ; GFX900-LABEL: s_shuffle_v3f32_v4f32__1_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -7678,6 +7754,7 @@ define void @s_shuffle_v3f32_v4f32__1_u_u() { ; ; GFX90A-LABEL: s_shuffle_v3f32_v4f32__1_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -7690,6 +7767,7 @@ define void @s_shuffle_v3f32_v4f32__1_u_u() { ; ; GFX942-LABEL: s_shuffle_v3f32_v4f32__1_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -7708,6 +7786,7 @@ define void @s_shuffle_v3f32_v4f32__1_u_u() { define void @s_shuffle_v3f32_v4f32__2_u_u() { ; GFX900-LABEL: s_shuffle_v3f32_v4f32__2_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -7720,6 +7799,7 @@ define void @s_shuffle_v3f32_v4f32__2_u_u() { ; ; GFX90A-LABEL: s_shuffle_v3f32_v4f32__2_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -7732,6 +7812,7 @@ define void @s_shuffle_v3f32_v4f32__2_u_u() { ; ; GFX942-LABEL: s_shuffle_v3f32_v4f32__2_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -7750,6 +7831,7 @@ define void @s_shuffle_v3f32_v4f32__2_u_u() { define void @s_shuffle_v3f32_v4f32__3_u_u() { ; GFX900-LABEL: s_shuffle_v3f32_v4f32__3_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -7762,6 +7844,7 @@ define void @s_shuffle_v3f32_v4f32__3_u_u() { ; ; GFX90A-LABEL: s_shuffle_v3f32_v4f32__3_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -7774,6 +7857,7 @@ define void @s_shuffle_v3f32_v4f32__3_u_u() { ; ; GFX942-LABEL: s_shuffle_v3f32_v4f32__3_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -7806,6 +7890,7 @@ define void @s_shuffle_v3f32_v4f32__4_u_u() { define void @s_shuffle_v3f32_v4f32__5_u_u() { ; GFX900-LABEL: s_shuffle_v3f32_v4f32__5_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -7818,6 +7903,7 @@ define void @s_shuffle_v3f32_v4f32__5_u_u() { ; ; GFX90A-LABEL: s_shuffle_v3f32_v4f32__5_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -7830,6 +7916,7 @@ define void @s_shuffle_v3f32_v4f32__5_u_u() { ; ; GFX942-LABEL: s_shuffle_v3f32_v4f32__5_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -7849,6 +7936,7 @@ define void @s_shuffle_v3f32_v4f32__5_u_u() { define void @s_shuffle_v3f32_v4f32__6_u_u() { ; GFX900-LABEL: s_shuffle_v3f32_v4f32__6_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -7861,6 +7949,7 @@ define void @s_shuffle_v3f32_v4f32__6_u_u() { ; ; GFX90A-LABEL: s_shuffle_v3f32_v4f32__6_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -7873,6 +7962,7 @@ define void @s_shuffle_v3f32_v4f32__6_u_u() { ; ; GFX942-LABEL: s_shuffle_v3f32_v4f32__6_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -7892,6 +7982,7 @@ define void @s_shuffle_v3f32_v4f32__6_u_u() { define void @s_shuffle_v3f32_v4f32__7_u_u() { ; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -7904,6 +7995,7 @@ define void @s_shuffle_v3f32_v4f32__7_u_u() { ; ; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -7916,6 +8008,7 @@ define void @s_shuffle_v3f32_v4f32__7_u_u() { ; ; GFX942-LABEL: s_shuffle_v3f32_v4f32__7_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -7939,6 +8032,7 @@ define void @s_shuffle_v3f32_v4f32__7_0_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] ; GFX900-NEXT: ;;#ASMEND @@ -7955,6 +8049,7 @@ define void @s_shuffle_v3f32_v4f32__7_0_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] ; GFX90A-NEXT: ;;#ASMEND @@ -7967,6 +8062,7 @@ define void @s_shuffle_v3f32_v4f32__7_0_u() { ; ; GFX942-LABEL: s_shuffle_v3f32_v4f32__7_0_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -8046,6 +8142,7 @@ define void @s_shuffle_v3f32_v4f32__7_2_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] ; GFX900-NEXT: ;;#ASMEND @@ -8062,6 +8159,7 @@ define void @s_shuffle_v3f32_v4f32__7_2_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] ; GFX90A-NEXT: ;;#ASMEND @@ -8074,6 +8172,7 @@ define void @s_shuffle_v3f32_v4f32__7_2_u() { ; ; GFX942-LABEL: s_shuffle_v3f32_v4f32__7_2_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -8101,6 +8200,7 @@ define void @s_shuffle_v3f32_v4f32__7_3_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] ; GFX900-NEXT: ;;#ASMEND @@ -8117,6 +8217,7 @@ define void @s_shuffle_v3f32_v4f32__7_3_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] ; GFX90A-NEXT: ;;#ASMEND @@ -8129,6 +8230,7 @@ define void @s_shuffle_v3f32_v4f32__7_3_u() { ; ; GFX942-LABEL: s_shuffle_v3f32_v4f32__7_3_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -8152,6 +8254,7 @@ define void @s_shuffle_v3f32_v4f32__7_3_u() { define void @s_shuffle_v3f32_v4f32__7_4_u() { ; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_4_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -8165,6 +8268,7 @@ define void @s_shuffle_v3f32_v4f32__7_4_u() { ; ; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_4_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -8178,6 +8282,7 @@ define void @s_shuffle_v3f32_v4f32__7_4_u() { ; ; GFX942-LABEL: s_shuffle_v3f32_v4f32__7_4_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -8217,6 +8322,7 @@ define void @s_shuffle_v3f32_v4f32__7_5_u() { define void @s_shuffle_v3f32_v4f32__7_6_u() { ; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_6_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -8230,6 +8336,7 @@ define void @s_shuffle_v3f32_v4f32__7_6_u() { ; ; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_6_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -8243,6 +8350,7 @@ define void @s_shuffle_v3f32_v4f32__7_6_u() { ; ; GFX942-LABEL: s_shuffle_v3f32_v4f32__7_6_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -8263,6 +8371,7 @@ define void @s_shuffle_v3f32_v4f32__7_6_u() { define void @s_shuffle_v3f32_v4f32__7_7_u() { ; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_7_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -8276,6 +8385,7 @@ define void @s_shuffle_v3f32_v4f32__7_7_u() { ; ; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_7_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -8289,6 +8399,7 @@ define void @s_shuffle_v3f32_v4f32__7_7_u() { ; ; GFX942-LABEL: s_shuffle_v3f32_v4f32__7_7_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -8705,6 +8816,7 @@ define void @s_shuffle_v3f32_v4f32__7_7_7() { define void @s_shuffle_v3f32_v4f32__u_0_0() { ; GFX900-LABEL: s_shuffle_v3f32_v4f32__u_0_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -8718,6 +8830,7 @@ define void @s_shuffle_v3f32_v4f32__u_0_0() { ; ; GFX90A-LABEL: s_shuffle_v3f32_v4f32__u_0_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -8731,6 +8844,7 @@ define void @s_shuffle_v3f32_v4f32__u_0_0() { ; ; GFX942-LABEL: s_shuffle_v3f32_v4f32__u_0_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -8913,6 +9027,7 @@ define void @s_shuffle_v3f32_v4f32__3_0_0() { define void @s_shuffle_v3f32_v4f32__4_0_0() { ; GFX900-LABEL: s_shuffle_v3f32_v4f32__4_0_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -8926,6 +9041,7 @@ define void @s_shuffle_v3f32_v4f32__4_0_0() { ; ; GFX90A-LABEL: s_shuffle_v3f32_v4f32__4_0_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -8939,6 +9055,7 @@ define void @s_shuffle_v3f32_v4f32__4_0_0() { ; ; GFX942-LABEL: s_shuffle_v3f32_v4f32__4_0_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -9136,6 +9253,7 @@ define void @s_shuffle_v3f32_v4f32__7_u_0() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] ; GFX900-NEXT: ;;#ASMEND @@ -9152,6 +9270,7 @@ define void @s_shuffle_v3f32_v4f32__7_u_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] ; GFX90A-NEXT: ;;#ASMEND @@ -9164,6 +9283,7 @@ define void @s_shuffle_v3f32_v4f32__7_u_0() { ; ; GFX942-LABEL: s_shuffle_v3f32_v4f32__7_u_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -9812,6 +9932,7 @@ define void @s_shuffle_v3f32_v4f32__7_u_1() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] ; GFX900-NEXT: ;;#ASMEND @@ -9828,6 +9949,7 @@ define void @s_shuffle_v3f32_v4f32__7_u_1() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] ; GFX90A-NEXT: ;;#ASMEND @@ -9840,6 +9962,7 @@ define void @s_shuffle_v3f32_v4f32__7_u_1() { ; ; GFX942-LABEL: s_shuffle_v3f32_v4f32__7_u_1: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -10866,6 +10989,7 @@ define void @s_shuffle_v3f32_v4f32__7_6_2() { define void @s_shuffle_v3f32_v4f32__u_3_3() { ; GFX900-LABEL: s_shuffle_v3f32_v4f32__u_3_3: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -10879,6 +11003,7 @@ define void @s_shuffle_v3f32_v4f32__u_3_3() { ; ; GFX90A-LABEL: s_shuffle_v3f32_v4f32__u_3_3: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -10892,6 +11017,7 @@ define void @s_shuffle_v3f32_v4f32__u_3_3() { ; ; GFX942-LABEL: s_shuffle_v3f32_v4f32__u_3_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -11074,6 +11200,7 @@ define void @s_shuffle_v3f32_v4f32__3_3_3() { define void @s_shuffle_v3f32_v4f32__4_3_3() { ; GFX900-LABEL: s_shuffle_v3f32_v4f32__4_3_3: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -11087,6 +11214,7 @@ define void @s_shuffle_v3f32_v4f32__4_3_3() { ; ; GFX90A-LABEL: s_shuffle_v3f32_v4f32__4_3_3: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -11100,6 +11228,7 @@ define void @s_shuffle_v3f32_v4f32__4_3_3() { ; ; GFX942-LABEL: s_shuffle_v3f32_v4f32__4_3_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -11297,6 +11426,7 @@ define void @s_shuffle_v3f32_v4f32__7_u_3() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] ; GFX900-NEXT: ;;#ASMEND @@ -11313,6 +11443,7 @@ define void @s_shuffle_v3f32_v4f32__7_u_3() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] ; GFX90A-NEXT: ;;#ASMEND @@ -11325,6 +11456,7 @@ define void @s_shuffle_v3f32_v4f32__7_u_3() { ; ; GFX942-LABEL: s_shuffle_v3f32_v4f32__7_u_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -11744,6 +11876,7 @@ define void @s_shuffle_v3f32_v4f32__0_4_4() { define void @s_shuffle_v3f32_v4f32__1_4_4() { ; GFX900-LABEL: s_shuffle_v3f32_v4f32__1_4_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -11756,6 +11889,7 @@ define void @s_shuffle_v3f32_v4f32__1_4_4() { ; ; GFX90A-LABEL: s_shuffle_v3f32_v4f32__1_4_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -11768,6 +11902,7 @@ define void @s_shuffle_v3f32_v4f32__1_4_4() { ; ; GFX942-LABEL: s_shuffle_v3f32_v4f32__1_4_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -11786,6 +11921,7 @@ define void @s_shuffle_v3f32_v4f32__1_4_4() { define void @s_shuffle_v3f32_v4f32__2_4_4() { ; GFX900-LABEL: s_shuffle_v3f32_v4f32__2_4_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -11798,6 +11934,7 @@ define void @s_shuffle_v3f32_v4f32__2_4_4() { ; ; GFX90A-LABEL: s_shuffle_v3f32_v4f32__2_4_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -11810,6 +11947,7 @@ define void @s_shuffle_v3f32_v4f32__2_4_4() { ; ; GFX942-LABEL: s_shuffle_v3f32_v4f32__2_4_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -11828,6 +11966,7 @@ define void @s_shuffle_v3f32_v4f32__2_4_4() { define void @s_shuffle_v3f32_v4f32__3_4_4() { ; GFX900-LABEL: s_shuffle_v3f32_v4f32__3_4_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -11840,6 +11979,7 @@ define void @s_shuffle_v3f32_v4f32__3_4_4() { ; ; GFX90A-LABEL: s_shuffle_v3f32_v4f32__3_4_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -11852,6 +11992,7 @@ define void @s_shuffle_v3f32_v4f32__3_4_4() { ; ; GFX942-LABEL: s_shuffle_v3f32_v4f32__3_4_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -12031,6 +12172,7 @@ define void @s_shuffle_v3f32_v4f32__7_4_4() { define void @s_shuffle_v3f32_v4f32__7_u_4() { ; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_u_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -12044,6 +12186,7 @@ define void @s_shuffle_v3f32_v4f32__7_u_4() { ; ; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_u_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -12057,6 +12200,7 @@ define void @s_shuffle_v3f32_v4f32__7_u_4() { ; ; GFX942-LABEL: s_shuffle_v3f32_v4f32__7_u_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -12722,6 +12866,7 @@ define void @s_shuffle_v3f32_v4f32__7_5_5() { define void @s_shuffle_v3f32_v4f32__7_u_5() { ; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_u_5: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -12735,6 +12880,7 @@ define void @s_shuffle_v3f32_v4f32__7_u_5() { ; ; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_u_5: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -12748,6 +12894,7 @@ define void @s_shuffle_v3f32_v4f32__7_u_5() { ; ; GFX942-LABEL: s_shuffle_v3f32_v4f32__7_u_5: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -13720,6 +13867,7 @@ define void @s_shuffle_v3f32_v4f32__7_5_6() { define void @s_shuffle_v3f32_v4f32__u_7_7() { ; GFX900-LABEL: s_shuffle_v3f32_v4f32__u_7_7: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -13733,6 +13881,7 @@ define void @s_shuffle_v3f32_v4f32__u_7_7() { ; ; GFX90A-LABEL: s_shuffle_v3f32_v4f32__u_7_7: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -13746,6 +13895,7 @@ define void @s_shuffle_v3f32_v4f32__u_7_7() { ; ; GFX942-LABEL: s_shuffle_v3f32_v4f32__u_7_7: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -14113,6 +14263,7 @@ define void @s_shuffle_v3f32_v4f32__6_7_7() { define void @s_shuffle_v3f32_v4f32__7_u_7() { ; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_u_7: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -14126,6 +14277,7 @@ define void @s_shuffle_v3f32_v4f32__7_u_7() { ; ; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_u_7: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -14139,6 +14291,7 @@ define void @s_shuffle_v3f32_v4f32__7_u_7() { ; ; GFX942-LABEL: s_shuffle_v3f32_v4f32__7_u_7: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i16.v3i16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i16.v3i16.ll index 0cf6da3659dde..98c90cfac2fe6 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i16.v3i16.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i16.v3i16.ll @@ -4817,6 +4817,7 @@ define void @s_shuffle_v3i16_v3i16__1_u_u() { define void @s_shuffle_v3i16_v3i16__2_u_u() { ; GFX900-LABEL: s_shuffle_v3i16_v3i16__2_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -4829,6 +4830,7 @@ define void @s_shuffle_v3i16_v3i16__2_u_u() { ; ; GFX90A-LABEL: s_shuffle_v3i16_v3i16__2_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -4841,6 +4843,7 @@ define void @s_shuffle_v3i16_v3i16__2_u_u() { ; ; GFX942-LABEL: s_shuffle_v3i16_v3i16__2_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -4923,6 +4926,7 @@ define void @s_shuffle_v3i16_v3i16__4_u_u() { define void @s_shuffle_v3i16_v3i16__5_u_u() { ; GFX900-LABEL: s_shuffle_v3i16_v3i16__5_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -4935,6 +4939,7 @@ define void @s_shuffle_v3i16_v3i16__5_u_u() { ; ; GFX90A-LABEL: s_shuffle_v3i16_v3i16__5_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -4947,6 +4952,7 @@ define void @s_shuffle_v3i16_v3i16__5_u_u() { ; ; GFX942-LABEL: s_shuffle_v3i16_v3i16__5_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -7387,6 +7393,7 @@ define void @s_shuffle_v3i16_v3i16__1_3_3() { define void @s_shuffle_v3i16_v3i16__2_3_3() { ; GFX900-LABEL: s_shuffle_v3i16_v3i16__2_3_3: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -7399,6 +7406,7 @@ define void @s_shuffle_v3i16_v3i16__2_3_3() { ; ; GFX90A-LABEL: s_shuffle_v3i16_v3i16__2_3_3: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -7411,6 +7419,7 @@ define void @s_shuffle_v3i16_v3i16__2_3_3() { ; ; GFX942-LABEL: s_shuffle_v3i16_v3i16__2_3_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i16.v4i16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i16.v4i16.ll index 977055e546bba..a8736558b36e9 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i16.v4i16.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i16.v4i16.ll @@ -7957,6 +7957,7 @@ define void @s_shuffle_v3i16_v4i16__1_u_u() { define void @s_shuffle_v3i16_v4i16__2_u_u() { ; GFX900-LABEL: s_shuffle_v3i16_v4i16__2_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -7969,6 +7970,7 @@ define void @s_shuffle_v3i16_v4i16__2_u_u() { ; ; GFX90A-LABEL: s_shuffle_v3i16_v4i16__2_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -7981,6 +7983,7 @@ define void @s_shuffle_v3i16_v4i16__2_u_u() { ; ; GFX942-LABEL: s_shuffle_v3i16_v4i16__2_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -8102,6 +8105,7 @@ define void @s_shuffle_v3i16_v4i16__5_u_u() { define void @s_shuffle_v3i16_v4i16__6_u_u() { ; GFX900-LABEL: s_shuffle_v3i16_v4i16__6_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -8114,6 +8118,7 @@ define void @s_shuffle_v3i16_v4i16__6_u_u() { ; ; GFX90A-LABEL: s_shuffle_v3i16_v4i16__6_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -8126,6 +8131,7 @@ define void @s_shuffle_v3i16_v4i16__6_u_u() { ; ; GFX942-LABEL: s_shuffle_v3i16_v4i16__6_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -12256,6 +12262,7 @@ define void @s_shuffle_v3i16_v4i16__1_4_4() { define void @s_shuffle_v3i16_v4i16__2_4_4() { ; GFX900-LABEL: s_shuffle_v3i16_v4i16__2_4_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -12268,6 +12275,7 @@ define void @s_shuffle_v3i16_v4i16__2_4_4() { ; ; GFX90A-LABEL: s_shuffle_v3i16_v4i16__2_4_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -12280,6 +12288,7 @@ define void @s_shuffle_v3i16_v4i16__2_4_4() { ; ; GFX942-LABEL: s_shuffle_v3i16_v4i16__2_4_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v2i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v2i32.ll index ea4fac3b1d2b1..96257966cfc3c 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v2i32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v2i32.ll @@ -59,11 +59,12 @@ define void @v_shuffle_v3i32_v2i32__1_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -71,11 +72,12 @@ define void @v_shuffle_v3i32_v2i32__1_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -83,11 +85,12 @@ define void @v_shuffle_v3i32_v2i32__1_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx3 v2, v[0:2], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -112,11 +115,12 @@ define void @v_shuffle_v3i32_v2i32__3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -124,11 +128,12 @@ define void @v_shuffle_v3i32_v2i32__3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -136,11 +141,12 @@ define void @v_shuffle_v3i32_v2i32__3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx3 v2, v[0:2], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -170,15 +176,16 @@ define void @v_shuffle_v3i32_v2i32__3_0_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -186,15 +193,16 @@ define void @v_shuffle_v3i32_v2i32__3_0_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[4:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -272,28 +280,30 @@ define void @v_shuffle_v3i32_v2i32__3_2_u(ptr addrspace(1) inreg %ptr) { ; ; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_2_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v2i32__3_2_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[4:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -560,26 +570,29 @@ define void @v_shuffle_v3i32_v2i32__u_0_0(ptr addrspace(1) inreg %ptr) { ; ; GFX90A-LABEL: v_shuffle_v3i32_v2i32__u_0_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v2i32__u_0_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[4:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -695,26 +708,29 @@ define void @v_shuffle_v3i32_v2i32__2_0_0(ptr addrspace(1) inreg %ptr) { ; ; GFX90A-LABEL: v_shuffle_v3i32_v2i32__2_0_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v2i32__2_0_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[4:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -784,15 +800,16 @@ define void @v_shuffle_v3i32_v2i32__3_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[4:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -800,14 +817,16 @@ define void @v_shuffle_v3i32_v2i32__3_u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -815,14 +834,16 @@ define void @v_shuffle_v3i32_v2i32__3_u_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[4:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -1192,15 +1213,16 @@ define void @v_shuffle_v3i32_v2i32__3_u_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1208,15 +1230,16 @@ define void @v_shuffle_v3i32_v2i32__3_u_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[4:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -1394,11 +1417,12 @@ define void @v_shuffle_v3i32_v2i32__1_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1406,11 +1430,12 @@ define void @v_shuffle_v3i32_v2i32__1_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1418,11 +1443,12 @@ define void @v_shuffle_v3i32_v2i32__1_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx3 v2, v[0:2], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -1492,39 +1518,44 @@ define void @v_shuffle_v3i32_v2i32__3_2_2(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v3i32_v2i32__3_u_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_u_2: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_u_2: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: global_store_dwordx3 v1, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v2i32__3_u_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v1, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[4:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: global_store_dwordx3 v1, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -1857,12 +1888,13 @@ define void @v_shuffle_v3i32_v2i32__3_u_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1870,12 +1902,13 @@ define void @v_shuffle_v3i32_v2i32__3_u_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -2101,6 +2134,7 @@ define void @s_shuffle_v3i32_v2i32__0_u_u() { define void @s_shuffle_v3i32_v2i32__1_u_u() { ; GFX900-LABEL: s_shuffle_v3i32_v2i32__1_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -2113,6 +2147,7 @@ define void @s_shuffle_v3i32_v2i32__1_u_u() { ; ; GFX90A-LABEL: s_shuffle_v3i32_v2i32__1_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -2125,6 +2160,7 @@ define void @s_shuffle_v3i32_v2i32__1_u_u() { ; ; GFX942-LABEL: s_shuffle_v3i32_v2i32__1_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -2157,6 +2193,7 @@ define void @s_shuffle_v3i32_v2i32__2_u_u() { define void @s_shuffle_v3i32_v2i32__3_u_u() { ; GFX900-LABEL: s_shuffle_v3i32_v2i32__3_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -2169,6 +2206,7 @@ define void @s_shuffle_v3i32_v2i32__3_u_u() { ; ; GFX90A-LABEL: s_shuffle_v3i32_v2i32__3_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -2181,6 +2219,7 @@ define void @s_shuffle_v3i32_v2i32__3_u_u() { ; ; GFX942-LABEL: s_shuffle_v3i32_v2i32__3_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -2200,6 +2239,7 @@ define void @s_shuffle_v3i32_v2i32__3_u_u() { define void @s_shuffle_v3i32_v2i32__3_0_u() { ; GFX900-LABEL: s_shuffle_v3i32_v2i32__3_0_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -2216,6 +2256,7 @@ define void @s_shuffle_v3i32_v2i32__3_0_u() { ; ; GFX90A-LABEL: s_shuffle_v3i32_v2i32__3_0_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -2232,6 +2273,7 @@ define void @s_shuffle_v3i32_v2i32__3_0_u() { ; ; GFX942-LABEL: s_shuffle_v3i32_v2i32__3_0_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -2307,6 +2349,7 @@ define void @s_shuffle_v3i32_v2i32__3_1_u() { define void @s_shuffle_v3i32_v2i32__3_2_u() { ; GFX900-LABEL: s_shuffle_v3i32_v2i32__3_2_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -2320,6 +2363,7 @@ define void @s_shuffle_v3i32_v2i32__3_2_u() { ; ; GFX90A-LABEL: s_shuffle_v3i32_v2i32__3_2_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -2333,6 +2377,7 @@ define void @s_shuffle_v3i32_v2i32__3_2_u() { ; ; GFX942-LABEL: s_shuffle_v3i32_v2i32__3_2_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -2551,6 +2596,7 @@ define void @s_shuffle_v3i32_v2i32__3_3_3() { define void @s_shuffle_v3i32_v2i32__u_0_0() { ; GFX900-LABEL: s_shuffle_v3i32_v2i32__u_0_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -2564,6 +2610,7 @@ define void @s_shuffle_v3i32_v2i32__u_0_0() { ; ; GFX90A-LABEL: s_shuffle_v3i32_v2i32__u_0_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -2577,6 +2624,7 @@ define void @s_shuffle_v3i32_v2i32__u_0_0() { ; ; GFX942-LABEL: s_shuffle_v3i32_v2i32__u_0_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -2663,6 +2711,7 @@ define void @s_shuffle_v3i32_v2i32__1_0_0() { define void @s_shuffle_v3i32_v2i32__2_0_0() { ; GFX900-LABEL: s_shuffle_v3i32_v2i32__2_0_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -2676,6 +2725,7 @@ define void @s_shuffle_v3i32_v2i32__2_0_0() { ; ; GFX90A-LABEL: s_shuffle_v3i32_v2i32__2_0_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -2689,6 +2739,7 @@ define void @s_shuffle_v3i32_v2i32__2_0_0() { ; ; GFX942-LABEL: s_shuffle_v3i32_v2i32__2_0_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -2766,6 +2817,7 @@ define void @s_shuffle_v3i32_v2i32__3_0_0() { define void @s_shuffle_v3i32_v2i32__3_u_0() { ; GFX900-LABEL: s_shuffle_v3i32_v2i32__3_u_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -2782,6 +2834,7 @@ define void @s_shuffle_v3i32_v2i32__3_u_0() { ; ; GFX90A-LABEL: s_shuffle_v3i32_v2i32__3_u_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -2798,6 +2851,7 @@ define void @s_shuffle_v3i32_v2i32__3_u_0() { ; ; GFX942-LABEL: s_shuffle_v3i32_v2i32__3_u_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -3065,6 +3119,7 @@ define void @s_shuffle_v3i32_v2i32__3_1_1() { define void @s_shuffle_v3i32_v2i32__3_u_1() { ; GFX900-LABEL: s_shuffle_v3i32_v2i32__3_u_1: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -3081,6 +3136,7 @@ define void @s_shuffle_v3i32_v2i32__3_u_1() { ; ; GFX90A-LABEL: s_shuffle_v3i32_v2i32__3_u_1: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -3097,6 +3153,7 @@ define void @s_shuffle_v3i32_v2i32__3_u_1() { ; ; GFX942-LABEL: s_shuffle_v3i32_v2i32__3_u_1: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -3290,6 +3347,7 @@ define void @s_shuffle_v3i32_v2i32__0_2_2() { define void @s_shuffle_v3i32_v2i32__1_2_2() { ; GFX900-LABEL: s_shuffle_v3i32_v2i32__1_2_2: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -3302,6 +3360,7 @@ define void @s_shuffle_v3i32_v2i32__1_2_2() { ; ; GFX90A-LABEL: s_shuffle_v3i32_v2i32__1_2_2: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -3314,6 +3373,7 @@ define void @s_shuffle_v3i32_v2i32__1_2_2() { ; ; GFX942-LABEL: s_shuffle_v3i32_v2i32__1_2_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -3395,6 +3455,7 @@ define void @s_shuffle_v3i32_v2i32__3_2_2() { define void @s_shuffle_v3i32_v2i32__3_u_2() { ; GFX900-LABEL: s_shuffle_v3i32_v2i32__3_u_2: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -3408,6 +3469,7 @@ define void @s_shuffle_v3i32_v2i32__3_u_2() { ; ; GFX90A-LABEL: s_shuffle_v3i32_v2i32__3_u_2: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -3421,6 +3483,7 @@ define void @s_shuffle_v3i32_v2i32__3_u_2() { ; ; GFX942-LABEL: s_shuffle_v3i32_v2i32__3_u_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -3702,6 +3765,7 @@ define void @s_shuffle_v3i32_v2i32__2_3_3() { define void @s_shuffle_v3i32_v2i32__3_u_3() { ; GFX900-LABEL: s_shuffle_v3i32_v2i32__3_u_3: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -3715,6 +3779,7 @@ define void @s_shuffle_v3i32_v2i32__3_u_3() { ; ; GFX90A-LABEL: s_shuffle_v3i32_v2i32__3_u_3: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -3728,6 +3793,7 @@ define void @s_shuffle_v3i32_v2i32__3_u_3() { ; ; GFX942-LABEL: s_shuffle_v3i32_v2i32__3_u_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v3i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v3i32.ll index 7061c13b28d03..a8a93e4d22aa8 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v3i32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v3i32.ll @@ -61,9 +61,10 @@ define void @v_shuffle_v3i32_v3i32__1_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -73,9 +74,10 @@ define void @v_shuffle_v3i32_v3i32__1_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -85,9 +87,10 @@ define void @v_shuffle_v3i32_v3i32__1_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -99,37 +102,41 @@ define void @v_shuffle_v3i32_v3i32__1_u_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v3i32_v3i32__2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__2_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i32_v3i32__2_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v3i32__2_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -156,9 +163,10 @@ define void @v_shuffle_v3i32_v3i32__4_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -168,9 +176,10 @@ define void @v_shuffle_v3i32_v3i32__4_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -180,9 +189,10 @@ define void @v_shuffle_v3i32_v3i32__4_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -195,37 +205,41 @@ define void @v_shuffle_v3i32_v3i32__4_u_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v3i32_v3i32__5_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v3i32__5_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -242,13 +256,14 @@ define void @v_shuffle_v3i32_v3i32__5_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -258,13 +273,14 @@ define void @v_shuffle_v3i32_v3i32__5_0_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -274,14 +290,14 @@ define void @v_shuffle_v3i32_v3i32__5_0_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8 ; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -347,49 +363,53 @@ define void @v_shuffle_v3i32_v3i32__5_1_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v3i32_v3i32__5_2_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_2_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: v_mov_b32_e32 v7, v2 +; GFX900-NEXT: global_store_dwordx3 v9, v[6:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_2_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr8_vgpr9_vgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[8:10], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v3i32__5_2_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr8_vgpr9_vgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v2 +; GFX942-NEXT: global_store_dwordx3 v3, v[8:10], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -402,39 +422,44 @@ define void @v_shuffle_v3i32_v3i32__5_2_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v3i32_v3i32__5_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_3_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_3_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v3i32__5_3_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -490,40 +515,44 @@ define void @v_shuffle_v3i32_v3i32__5_4_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v3i32_v3i32__5_5_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_5_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_5_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v3i32__5_5_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -859,10 +888,11 @@ define void @v_shuffle_v3i32_v3i32__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -872,10 +902,11 @@ define void @v_shuffle_v3i32_v3i32__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -885,10 +916,11 @@ define void @v_shuffle_v3i32_v3i32__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1043,10 +1075,11 @@ define void @v_shuffle_v3i32_v3i32__3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1056,10 +1089,11 @@ define void @v_shuffle_v3i32_v3i32__3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1069,10 +1103,11 @@ define void @v_shuffle_v3i32_v3i32__3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1206,13 +1241,14 @@ define void @v_shuffle_v3i32_v3i32__5_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1222,13 +1258,14 @@ define void @v_shuffle_v3i32_v3i32__5_u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1238,14 +1275,14 @@ define void @v_shuffle_v3i32_v3i32__5_u_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8 ; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1818,13 +1855,14 @@ define void @v_shuffle_v3i32_v3i32__5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr5_vgpr6_vgpr7 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx3 v8, v[5:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1834,13 +1872,14 @@ define void @v_shuffle_v3i32_v3i32__5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1850,14 +1889,14 @@ define void @v_shuffle_v3i32_v3i32__5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8 ; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: global_store_dwordx3 v5, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -2754,9 +2793,10 @@ define void @v_shuffle_v3i32_v3i32__1_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2766,9 +2806,10 @@ define void @v_shuffle_v3i32_v3i32__1_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2778,9 +2819,10 @@ define void @v_shuffle_v3i32_v3i32__1_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -2792,37 +2834,41 @@ define void @v_shuffle_v3i32_v3i32__1_3_3(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v3i32_v3i32__2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__2_3_3: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i32_v3i32__2_3_3: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v3i32__2_3_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -2941,39 +2987,44 @@ define void @v_shuffle_v3i32_v3i32__5_3_3(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v3i32_v3i32__5_u_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_u_3: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_u_3: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v3i32__5_u_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -3544,40 +3595,44 @@ define void @v_shuffle_v3i32_v3i32__5_4_4(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v3i32_v3i32__5_u_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_u_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_u_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v3i32__5_u_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -4462,6 +4517,7 @@ define void @s_shuffle_v3i32_v3i32__0_u_u() { define void @s_shuffle_v3i32_v3i32__1_u_u() { ; GFX900-LABEL: s_shuffle_v3i32_v3i32__1_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -4474,6 +4530,7 @@ define void @s_shuffle_v3i32_v3i32__1_u_u() { ; ; GFX90A-LABEL: s_shuffle_v3i32_v3i32__1_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -4486,6 +4543,7 @@ define void @s_shuffle_v3i32_v3i32__1_u_u() { ; ; GFX942-LABEL: s_shuffle_v3i32_v3i32__1_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -4504,6 +4562,7 @@ define void @s_shuffle_v3i32_v3i32__1_u_u() { define void @s_shuffle_v3i32_v3i32__2_u_u() { ; GFX900-LABEL: s_shuffle_v3i32_v3i32__2_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -4516,6 +4575,7 @@ define void @s_shuffle_v3i32_v3i32__2_u_u() { ; ; GFX90A-LABEL: s_shuffle_v3i32_v3i32__2_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -4528,6 +4588,7 @@ define void @s_shuffle_v3i32_v3i32__2_u_u() { ; ; GFX942-LABEL: s_shuffle_v3i32_v3i32__2_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -4560,6 +4621,7 @@ define void @s_shuffle_v3i32_v3i32__3_u_u() { define void @s_shuffle_v3i32_v3i32__4_u_u() { ; GFX900-LABEL: s_shuffle_v3i32_v3i32__4_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -4572,6 +4634,7 @@ define void @s_shuffle_v3i32_v3i32__4_u_u() { ; ; GFX90A-LABEL: s_shuffle_v3i32_v3i32__4_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -4584,6 +4647,7 @@ define void @s_shuffle_v3i32_v3i32__4_u_u() { ; ; GFX942-LABEL: s_shuffle_v3i32_v3i32__4_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -4603,6 +4667,7 @@ define void @s_shuffle_v3i32_v3i32__4_u_u() { define void @s_shuffle_v3i32_v3i32__5_u_u() { ; GFX900-LABEL: s_shuffle_v3i32_v3i32__5_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -4615,6 +4680,7 @@ define void @s_shuffle_v3i32_v3i32__5_u_u() { ; ; GFX90A-LABEL: s_shuffle_v3i32_v3i32__5_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -4627,6 +4693,7 @@ define void @s_shuffle_v3i32_v3i32__5_u_u() { ; ; GFX942-LABEL: s_shuffle_v3i32_v3i32__5_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -4646,14 +4713,15 @@ define void @s_shuffle_v3i32_v3i32__5_u_u() { define void @s_shuffle_v3i32_v3i32__5_0_u() { ; GFX900-LABEL: s_shuffle_v3i32_v3i32__5_0_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ; def s[12:14] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s8, s14 ; GFX900-NEXT: s_mov_b32 s9, s4 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:10] @@ -4662,14 +4730,15 @@ define void @s_shuffle_v3i32_v3i32__5_0_u() { ; ; GFX90A-LABEL: s_shuffle_v3i32_v3i32__5_0_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ; def s[12:14] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s8, s14 ; GFX90A-NEXT: s_mov_b32 s9, s4 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:10] @@ -4678,6 +4747,7 @@ define void @s_shuffle_v3i32_v3i32__5_0_u() { ; ; GFX942-LABEL: s_shuffle_v3i32_v3i32__5_0_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -4753,14 +4823,15 @@ define void @s_shuffle_v3i32_v3i32__5_1_u() { define void @s_shuffle_v3i32_v3i32__5_2_u() { ; GFX900-LABEL: s_shuffle_v3i32_v3i32__5_2_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ; def s[12:14] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s8, s14 ; GFX900-NEXT: s_mov_b32 s9, s6 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:10] @@ -4769,14 +4840,15 @@ define void @s_shuffle_v3i32_v3i32__5_2_u() { ; ; GFX90A-LABEL: s_shuffle_v3i32_v3i32__5_2_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ; def s[12:14] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s8, s14 ; GFX90A-NEXT: s_mov_b32 s9, s6 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:10] @@ -4785,6 +4857,7 @@ define void @s_shuffle_v3i32_v3i32__5_2_u() { ; ; GFX942-LABEL: s_shuffle_v3i32_v3i32__5_2_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -4808,6 +4881,7 @@ define void @s_shuffle_v3i32_v3i32__5_2_u() { define void @s_shuffle_v3i32_v3i32__5_3_u() { ; GFX900-LABEL: s_shuffle_v3i32_v3i32__5_3_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -4821,6 +4895,7 @@ define void @s_shuffle_v3i32_v3i32__5_3_u() { ; ; GFX90A-LABEL: s_shuffle_v3i32_v3i32__5_3_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -4834,6 +4909,7 @@ define void @s_shuffle_v3i32_v3i32__5_3_u() { ; ; GFX942-LABEL: s_shuffle_v3i32_v3i32__5_3_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -4873,6 +4949,7 @@ define void @s_shuffle_v3i32_v3i32__5_4_u() { define void @s_shuffle_v3i32_v3i32__5_5_u() { ; GFX900-LABEL: s_shuffle_v3i32_v3i32__5_5_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -4886,6 +4963,7 @@ define void @s_shuffle_v3i32_v3i32__5_5_u() { ; ; GFX90A-LABEL: s_shuffle_v3i32_v3i32__5_5_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -4899,6 +4977,7 @@ define void @s_shuffle_v3i32_v3i32__5_5_u() { ; ; GFX942-LABEL: s_shuffle_v3i32_v3i32__5_5_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -5208,6 +5287,7 @@ define void @s_shuffle_v3i32_v3i32__5_5_5() { define void @s_shuffle_v3i32_v3i32__u_0_0() { ; GFX900-LABEL: s_shuffle_v3i32_v3i32__u_0_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -5221,6 +5301,7 @@ define void @s_shuffle_v3i32_v3i32__u_0_0() { ; ; GFX90A-LABEL: s_shuffle_v3i32_v3i32__u_0_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -5234,6 +5315,7 @@ define void @s_shuffle_v3i32_v3i32__u_0_0() { ; ; GFX942-LABEL: s_shuffle_v3i32_v3i32__u_0_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -5368,6 +5450,7 @@ define void @s_shuffle_v3i32_v3i32__2_0_0() { define void @s_shuffle_v3i32_v3i32__3_0_0() { ; GFX900-LABEL: s_shuffle_v3i32_v3i32__3_0_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -5381,6 +5464,7 @@ define void @s_shuffle_v3i32_v3i32__3_0_0() { ; ; GFX90A-LABEL: s_shuffle_v3i32_v3i32__3_0_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -5394,6 +5478,7 @@ define void @s_shuffle_v3i32_v3i32__3_0_0() { ; ; GFX942-LABEL: s_shuffle_v3i32_v3i32__3_0_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -5529,14 +5614,15 @@ define void @s_shuffle_v3i32_v3i32__5_0_0() { define void @s_shuffle_v3i32_v3i32__5_u_0() { ; GFX900-LABEL: s_shuffle_v3i32_v3i32__5_u_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ; def s[12:14] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s8, s14 ; GFX900-NEXT: s_mov_b32 s10, s4 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:10] @@ -5545,14 +5631,15 @@ define void @s_shuffle_v3i32_v3i32__5_u_0() { ; ; GFX90A-LABEL: s_shuffle_v3i32_v3i32__5_u_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ; def s[12:14] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s8, s14 ; GFX90A-NEXT: s_mov_b32 s10, s4 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:10] @@ -5561,6 +5648,7 @@ define void @s_shuffle_v3i32_v3i32__5_u_0() { ; ; GFX942-LABEL: s_shuffle_v3i32_v3i32__5_u_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -6015,14 +6103,15 @@ define void @s_shuffle_v3i32_v3i32__5_1_1() { define void @s_shuffle_v3i32_v3i32__5_u_1() { ; GFX900-LABEL: s_shuffle_v3i32_v3i32__5_u_1: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ; def s[12:14] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s8, s14 ; GFX900-NEXT: s_mov_b32 s10, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:10] @@ -6031,14 +6120,15 @@ define void @s_shuffle_v3i32_v3i32__5_u_1() { ; ; GFX90A-LABEL: s_shuffle_v3i32_v3i32__5_u_1: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ; def s[12:14] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s8, s14 ; GFX90A-NEXT: s_mov_b32 s10, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:10] @@ -6047,6 +6137,7 @@ define void @s_shuffle_v3i32_v3i32__5_u_1() { ; ; GFX942-LABEL: s_shuffle_v3i32_v3i32__5_u_1: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -6827,6 +6918,7 @@ define void @s_shuffle_v3i32_v3i32__0_3_3() { define void @s_shuffle_v3i32_v3i32__1_3_3() { ; GFX900-LABEL: s_shuffle_v3i32_v3i32__1_3_3: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -6839,6 +6931,7 @@ define void @s_shuffle_v3i32_v3i32__1_3_3() { ; ; GFX90A-LABEL: s_shuffle_v3i32_v3i32__1_3_3: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -6851,6 +6944,7 @@ define void @s_shuffle_v3i32_v3i32__1_3_3() { ; ; GFX942-LABEL: s_shuffle_v3i32_v3i32__1_3_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -6869,6 +6963,7 @@ define void @s_shuffle_v3i32_v3i32__1_3_3() { define void @s_shuffle_v3i32_v3i32__2_3_3() { ; GFX900-LABEL: s_shuffle_v3i32_v3i32__2_3_3: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -6881,6 +6976,7 @@ define void @s_shuffle_v3i32_v3i32__2_3_3() { ; ; GFX90A-LABEL: s_shuffle_v3i32_v3i32__2_3_3: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -6893,6 +6989,7 @@ define void @s_shuffle_v3i32_v3i32__2_3_3() { ; ; GFX942-LABEL: s_shuffle_v3i32_v3i32__2_3_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -7023,6 +7120,7 @@ define void @s_shuffle_v3i32_v3i32__5_3_3() { define void @s_shuffle_v3i32_v3i32__5_u_3() { ; GFX900-LABEL: s_shuffle_v3i32_v3i32__5_u_3: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -7036,6 +7134,7 @@ define void @s_shuffle_v3i32_v3i32__5_u_3() { ; ; GFX90A-LABEL: s_shuffle_v3i32_v3i32__5_u_3: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -7049,6 +7148,7 @@ define void @s_shuffle_v3i32_v3i32__5_u_3() { ; ; GFX942-LABEL: s_shuffle_v3i32_v3i32__5_u_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -7532,6 +7632,7 @@ define void @s_shuffle_v3i32_v3i32__5_4_4() { define void @s_shuffle_v3i32_v3i32__5_u_4() { ; GFX900-LABEL: s_shuffle_v3i32_v3i32__5_u_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -7545,6 +7646,7 @@ define void @s_shuffle_v3i32_v3i32__5_u_4() { ; ; GFX90A-LABEL: s_shuffle_v3i32_v3i32__5_u_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -7558,6 +7660,7 @@ define void @s_shuffle_v3i32_v3i32__5_u_4() { ; ; GFX942-LABEL: s_shuffle_v3i32_v3i32__5_u_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v4i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v4i32.ll index 11d1897d0449f..80e32ef57442a 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v4i32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v4i32.ll @@ -72,9 +72,10 @@ define void @v_shuffle_v3i32_v4i32__1_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -84,9 +85,10 @@ define void @v_shuffle_v3i32_v4i32__1_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -102,33 +104,37 @@ define void @v_shuffle_v3i32_v4i32__2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__2_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__2_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -144,6 +150,7 @@ define void @v_shuffle_v3i32_v4i32__3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v3 ; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] @@ -156,6 +163,7 @@ define void @v_shuffle_v3i32_v4i32__3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] @@ -168,6 +176,7 @@ define void @v_shuffle_v3i32_v4i32__3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] @@ -208,9 +217,10 @@ define void @v_shuffle_v3i32_v4i32__5_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -220,9 +230,10 @@ define void @v_shuffle_v3i32_v4i32__5_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -239,33 +250,37 @@ define void @v_shuffle_v3i32_v4i32__6_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__6_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__6_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -282,6 +297,7 @@ define void @v_shuffle_v3i32_v4i32__7_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v3 ; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] @@ -294,6 +310,7 @@ define void @v_shuffle_v3i32_v4i32__7_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] @@ -306,6 +323,7 @@ define void @v_shuffle_v3i32_v4i32__7_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] @@ -328,6 +346,7 @@ define void @v_shuffle_v3i32_v4i32__7_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: v_mov_b32_e32 v1, v4 ; GFX900-NEXT: v_mov_b32_e32 v2, v0 @@ -344,6 +363,7 @@ define void @v_shuffle_v3i32_v4i32__7_0_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v5 ; GFX90A-NEXT: v_mov_b32_e32 v3, v0 @@ -361,6 +381,7 @@ define void @v_shuffle_v3i32_v4i32__7_0_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mov_b32_e32 v2, v5 ; GFX942-NEXT: v_mov_b32_e32 v3, v0 @@ -447,15 +468,16 @@ define void @v_shuffle_v3i32_v4i32__7_2_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx3 v8, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -463,15 +485,16 @@ define void @v_shuffle_v3i32_v4i32__7_2_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v7 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -488,6 +511,7 @@ define void @v_shuffle_v3i32_v4i32__7_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:7] @@ -504,6 +528,7 @@ define void @v_shuffle_v3i32_v4i32__7_3_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] @@ -520,6 +545,7 @@ define void @v_shuffle_v3i32_v4i32__7_3_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] @@ -539,40 +565,44 @@ define void @v_shuffle_v3i32_v4i32__7_3_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v3i32_v4i32__7_4_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_4_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_4_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__7_4_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -640,27 +670,30 @@ define void @v_shuffle_v3i32_v4i32__7_6_u(ptr addrspace(1) inreg %ptr) { ; ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_6_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__7_6_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -677,6 +710,7 @@ define void @v_shuffle_v3i32_v4i32__7_7_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v3 ; GFX900-NEXT: v_mov_b32_e32 v1, v3 @@ -690,6 +724,7 @@ define void @v_shuffle_v3i32_v4i32__7_7_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3 @@ -703,6 +738,7 @@ define void @v_shuffle_v3i32_v4i32__7_7_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: v_mov_b32_e32 v1, v3 @@ -1141,10 +1177,11 @@ define void @v_shuffle_v3i32_v4i32__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 ; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1154,10 +1191,11 @@ define void @v_shuffle_v3i32_v4i32__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1167,10 +1205,11 @@ define void @v_shuffle_v3i32_v4i32__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -1372,10 +1411,11 @@ define void @v_shuffle_v3i32_v4i32__4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 ; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1385,10 +1425,11 @@ define void @v_shuffle_v3i32_v4i32__4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1398,10 +1439,11 @@ define void @v_shuffle_v3i32_v4i32__4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -1596,6 +1638,7 @@ define void @v_shuffle_v3i32_v4i32__7_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: v_mov_b32_e32 v1, v4 ; GFX900-NEXT: v_mov_b32_e32 v3, v0 @@ -1612,6 +1655,7 @@ define void @v_shuffle_v3i32_v4i32__7_u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v5 ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 @@ -1629,6 +1673,7 @@ define void @v_shuffle_v3i32_v4i32__7_u_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mov_b32_e32 v2, v5 ; GFX942-NEXT: v_mov_b32_e32 v4, v0 @@ -2425,10 +2470,11 @@ define void @v_shuffle_v3i32_v4i32__7_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 ; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2441,10 +2487,11 @@ define void @v_shuffle_v3i32_v4i32__7_u_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2458,10 +2505,11 @@ define void @v_shuffle_v3i32_v4i32__7_u_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v5 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v5 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -3645,6 +3693,7 @@ define void @v_shuffle_v3i32_v4i32__u_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3 ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 @@ -3658,6 +3707,7 @@ define void @v_shuffle_v3i32_v4i32__u_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: v_mov_b32_e32 v1, v3 ; GFX942-NEXT: v_mov_b32_e32 v2, v3 @@ -3874,6 +3924,7 @@ define void @v_shuffle_v3i32_v4i32__4_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3 ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 @@ -3887,6 +3938,7 @@ define void @v_shuffle_v3i32_v4i32__4_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: v_mov_b32_e32 v1, v3 ; GFX942-NEXT: v_mov_b32_e32 v2, v3 @@ -4092,6 +4144,7 @@ define void @v_shuffle_v3i32_v4i32__7_u_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] @@ -4108,6 +4161,7 @@ define void @v_shuffle_v3i32_v4i32__7_u_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] @@ -4529,9 +4583,10 @@ define void @v_shuffle_v3i32_v4i32__1_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4541,9 +4596,10 @@ define void @v_shuffle_v3i32_v4i32__1_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -4559,33 +4615,37 @@ define void @v_shuffle_v3i32_v4i32__2_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__2_4_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__2_4_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -4601,6 +4661,7 @@ define void @v_shuffle_v3i32_v4i32__3_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v3 ; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] @@ -4613,6 +4674,7 @@ define void @v_shuffle_v3i32_v4i32__3_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] @@ -4625,6 +4687,7 @@ define void @v_shuffle_v3i32_v4i32__3_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] @@ -4794,40 +4857,44 @@ define void @v_shuffle_v3i32_v4i32__7_4_4(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v3i32_v4i32__7_u_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_u_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_u_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__7_u_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -5608,40 +5675,44 @@ define void @v_shuffle_v3i32_v4i32__7_5_5(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v3i32_v4i32__7_u_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_u_5: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_u_5: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i32_v4i32__7_u_5: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -6800,6 +6871,7 @@ define void @v_shuffle_v3i32_v4i32__u_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3 ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 @@ -6813,6 +6885,7 @@ define void @v_shuffle_v3i32_v4i32__u_7_7(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: v_mov_b32_e32 v1, v3 ; GFX942-NEXT: v_mov_b32_e32 v2, v3 @@ -7213,6 +7286,7 @@ define void @v_shuffle_v3i32_v4i32__7_u_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 @@ -7226,6 +7300,7 @@ define void @v_shuffle_v3i32_v4i32__7_u_7(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: v_mov_b32_e32 v2, v3 @@ -7666,6 +7741,7 @@ define void @s_shuffle_v3i32_v4i32__0_u_u() { define void @s_shuffle_v3i32_v4i32__1_u_u() { ; GFX900-LABEL: s_shuffle_v3i32_v4i32__1_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -7678,6 +7754,7 @@ define void @s_shuffle_v3i32_v4i32__1_u_u() { ; ; GFX90A-LABEL: s_shuffle_v3i32_v4i32__1_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -7690,6 +7767,7 @@ define void @s_shuffle_v3i32_v4i32__1_u_u() { ; ; GFX942-LABEL: s_shuffle_v3i32_v4i32__1_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -7708,6 +7786,7 @@ define void @s_shuffle_v3i32_v4i32__1_u_u() { define void @s_shuffle_v3i32_v4i32__2_u_u() { ; GFX900-LABEL: s_shuffle_v3i32_v4i32__2_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -7720,6 +7799,7 @@ define void @s_shuffle_v3i32_v4i32__2_u_u() { ; ; GFX90A-LABEL: s_shuffle_v3i32_v4i32__2_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -7732,6 +7812,7 @@ define void @s_shuffle_v3i32_v4i32__2_u_u() { ; ; GFX942-LABEL: s_shuffle_v3i32_v4i32__2_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -7750,6 +7831,7 @@ define void @s_shuffle_v3i32_v4i32__2_u_u() { define void @s_shuffle_v3i32_v4i32__3_u_u() { ; GFX900-LABEL: s_shuffle_v3i32_v4i32__3_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -7762,6 +7844,7 @@ define void @s_shuffle_v3i32_v4i32__3_u_u() { ; ; GFX90A-LABEL: s_shuffle_v3i32_v4i32__3_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -7774,6 +7857,7 @@ define void @s_shuffle_v3i32_v4i32__3_u_u() { ; ; GFX942-LABEL: s_shuffle_v3i32_v4i32__3_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -7806,6 +7890,7 @@ define void @s_shuffle_v3i32_v4i32__4_u_u() { define void @s_shuffle_v3i32_v4i32__5_u_u() { ; GFX900-LABEL: s_shuffle_v3i32_v4i32__5_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -7818,6 +7903,7 @@ define void @s_shuffle_v3i32_v4i32__5_u_u() { ; ; GFX90A-LABEL: s_shuffle_v3i32_v4i32__5_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -7830,6 +7916,7 @@ define void @s_shuffle_v3i32_v4i32__5_u_u() { ; ; GFX942-LABEL: s_shuffle_v3i32_v4i32__5_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -7849,6 +7936,7 @@ define void @s_shuffle_v3i32_v4i32__5_u_u() { define void @s_shuffle_v3i32_v4i32__6_u_u() { ; GFX900-LABEL: s_shuffle_v3i32_v4i32__6_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -7861,6 +7949,7 @@ define void @s_shuffle_v3i32_v4i32__6_u_u() { ; ; GFX90A-LABEL: s_shuffle_v3i32_v4i32__6_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -7873,6 +7962,7 @@ define void @s_shuffle_v3i32_v4i32__6_u_u() { ; ; GFX942-LABEL: s_shuffle_v3i32_v4i32__6_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -7892,6 +7982,7 @@ define void @s_shuffle_v3i32_v4i32__6_u_u() { define void @s_shuffle_v3i32_v4i32__7_u_u() { ; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -7904,6 +7995,7 @@ define void @s_shuffle_v3i32_v4i32__7_u_u() { ; ; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -7916,6 +8008,7 @@ define void @s_shuffle_v3i32_v4i32__7_u_u() { ; ; GFX942-LABEL: s_shuffle_v3i32_v4i32__7_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -7939,6 +8032,7 @@ define void @s_shuffle_v3i32_v4i32__7_0_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] ; GFX900-NEXT: ;;#ASMEND @@ -7955,6 +8049,7 @@ define void @s_shuffle_v3i32_v4i32__7_0_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] ; GFX90A-NEXT: ;;#ASMEND @@ -7967,6 +8062,7 @@ define void @s_shuffle_v3i32_v4i32__7_0_u() { ; ; GFX942-LABEL: s_shuffle_v3i32_v4i32__7_0_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -8046,6 +8142,7 @@ define void @s_shuffle_v3i32_v4i32__7_2_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] ; GFX900-NEXT: ;;#ASMEND @@ -8062,6 +8159,7 @@ define void @s_shuffle_v3i32_v4i32__7_2_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] ; GFX90A-NEXT: ;;#ASMEND @@ -8074,6 +8172,7 @@ define void @s_shuffle_v3i32_v4i32__7_2_u() { ; ; GFX942-LABEL: s_shuffle_v3i32_v4i32__7_2_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -8101,6 +8200,7 @@ define void @s_shuffle_v3i32_v4i32__7_3_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] ; GFX900-NEXT: ;;#ASMEND @@ -8117,6 +8217,7 @@ define void @s_shuffle_v3i32_v4i32__7_3_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] ; GFX90A-NEXT: ;;#ASMEND @@ -8129,6 +8230,7 @@ define void @s_shuffle_v3i32_v4i32__7_3_u() { ; ; GFX942-LABEL: s_shuffle_v3i32_v4i32__7_3_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -8152,6 +8254,7 @@ define void @s_shuffle_v3i32_v4i32__7_3_u() { define void @s_shuffle_v3i32_v4i32__7_4_u() { ; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_4_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -8165,6 +8268,7 @@ define void @s_shuffle_v3i32_v4i32__7_4_u() { ; ; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_4_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -8178,6 +8282,7 @@ define void @s_shuffle_v3i32_v4i32__7_4_u() { ; ; GFX942-LABEL: s_shuffle_v3i32_v4i32__7_4_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -8217,6 +8322,7 @@ define void @s_shuffle_v3i32_v4i32__7_5_u() { define void @s_shuffle_v3i32_v4i32__7_6_u() { ; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_6_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -8230,6 +8336,7 @@ define void @s_shuffle_v3i32_v4i32__7_6_u() { ; ; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_6_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -8243,6 +8350,7 @@ define void @s_shuffle_v3i32_v4i32__7_6_u() { ; ; GFX942-LABEL: s_shuffle_v3i32_v4i32__7_6_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -8263,6 +8371,7 @@ define void @s_shuffle_v3i32_v4i32__7_6_u() { define void @s_shuffle_v3i32_v4i32__7_7_u() { ; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_7_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -8276,6 +8385,7 @@ define void @s_shuffle_v3i32_v4i32__7_7_u() { ; ; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_7_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -8289,6 +8399,7 @@ define void @s_shuffle_v3i32_v4i32__7_7_u() { ; ; GFX942-LABEL: s_shuffle_v3i32_v4i32__7_7_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -8705,6 +8816,7 @@ define void @s_shuffle_v3i32_v4i32__7_7_7() { define void @s_shuffle_v3i32_v4i32__u_0_0() { ; GFX900-LABEL: s_shuffle_v3i32_v4i32__u_0_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -8718,6 +8830,7 @@ define void @s_shuffle_v3i32_v4i32__u_0_0() { ; ; GFX90A-LABEL: s_shuffle_v3i32_v4i32__u_0_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -8731,6 +8844,7 @@ define void @s_shuffle_v3i32_v4i32__u_0_0() { ; ; GFX942-LABEL: s_shuffle_v3i32_v4i32__u_0_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -8913,6 +9027,7 @@ define void @s_shuffle_v3i32_v4i32__3_0_0() { define void @s_shuffle_v3i32_v4i32__4_0_0() { ; GFX900-LABEL: s_shuffle_v3i32_v4i32__4_0_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -8926,6 +9041,7 @@ define void @s_shuffle_v3i32_v4i32__4_0_0() { ; ; GFX90A-LABEL: s_shuffle_v3i32_v4i32__4_0_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -8939,6 +9055,7 @@ define void @s_shuffle_v3i32_v4i32__4_0_0() { ; ; GFX942-LABEL: s_shuffle_v3i32_v4i32__4_0_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -9136,6 +9253,7 @@ define void @s_shuffle_v3i32_v4i32__7_u_0() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] ; GFX900-NEXT: ;;#ASMEND @@ -9152,6 +9270,7 @@ define void @s_shuffle_v3i32_v4i32__7_u_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] ; GFX90A-NEXT: ;;#ASMEND @@ -9164,6 +9283,7 @@ define void @s_shuffle_v3i32_v4i32__7_u_0() { ; ; GFX942-LABEL: s_shuffle_v3i32_v4i32__7_u_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -9812,6 +9932,7 @@ define void @s_shuffle_v3i32_v4i32__7_u_1() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] ; GFX900-NEXT: ;;#ASMEND @@ -9828,6 +9949,7 @@ define void @s_shuffle_v3i32_v4i32__7_u_1() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] ; GFX90A-NEXT: ;;#ASMEND @@ -9840,6 +9962,7 @@ define void @s_shuffle_v3i32_v4i32__7_u_1() { ; ; GFX942-LABEL: s_shuffle_v3i32_v4i32__7_u_1: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -10866,6 +10989,7 @@ define void @s_shuffle_v3i32_v4i32__7_6_2() { define void @s_shuffle_v3i32_v4i32__u_3_3() { ; GFX900-LABEL: s_shuffle_v3i32_v4i32__u_3_3: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -10879,6 +11003,7 @@ define void @s_shuffle_v3i32_v4i32__u_3_3() { ; ; GFX90A-LABEL: s_shuffle_v3i32_v4i32__u_3_3: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -10892,6 +11017,7 @@ define void @s_shuffle_v3i32_v4i32__u_3_3() { ; ; GFX942-LABEL: s_shuffle_v3i32_v4i32__u_3_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -11074,6 +11200,7 @@ define void @s_shuffle_v3i32_v4i32__3_3_3() { define void @s_shuffle_v3i32_v4i32__4_3_3() { ; GFX900-LABEL: s_shuffle_v3i32_v4i32__4_3_3: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -11087,6 +11214,7 @@ define void @s_shuffle_v3i32_v4i32__4_3_3() { ; ; GFX90A-LABEL: s_shuffle_v3i32_v4i32__4_3_3: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -11100,6 +11228,7 @@ define void @s_shuffle_v3i32_v4i32__4_3_3() { ; ; GFX942-LABEL: s_shuffle_v3i32_v4i32__4_3_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -11297,6 +11426,7 @@ define void @s_shuffle_v3i32_v4i32__7_u_3() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] ; GFX900-NEXT: ;;#ASMEND @@ -11313,6 +11443,7 @@ define void @s_shuffle_v3i32_v4i32__7_u_3() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] ; GFX90A-NEXT: ;;#ASMEND @@ -11325,6 +11456,7 @@ define void @s_shuffle_v3i32_v4i32__7_u_3() { ; ; GFX942-LABEL: s_shuffle_v3i32_v4i32__7_u_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -11744,6 +11876,7 @@ define void @s_shuffle_v3i32_v4i32__0_4_4() { define void @s_shuffle_v3i32_v4i32__1_4_4() { ; GFX900-LABEL: s_shuffle_v3i32_v4i32__1_4_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -11756,6 +11889,7 @@ define void @s_shuffle_v3i32_v4i32__1_4_4() { ; ; GFX90A-LABEL: s_shuffle_v3i32_v4i32__1_4_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -11768,6 +11902,7 @@ define void @s_shuffle_v3i32_v4i32__1_4_4() { ; ; GFX942-LABEL: s_shuffle_v3i32_v4i32__1_4_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -11786,6 +11921,7 @@ define void @s_shuffle_v3i32_v4i32__1_4_4() { define void @s_shuffle_v3i32_v4i32__2_4_4() { ; GFX900-LABEL: s_shuffle_v3i32_v4i32__2_4_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -11798,6 +11934,7 @@ define void @s_shuffle_v3i32_v4i32__2_4_4() { ; ; GFX90A-LABEL: s_shuffle_v3i32_v4i32__2_4_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -11810,6 +11947,7 @@ define void @s_shuffle_v3i32_v4i32__2_4_4() { ; ; GFX942-LABEL: s_shuffle_v3i32_v4i32__2_4_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -11828,6 +11966,7 @@ define void @s_shuffle_v3i32_v4i32__2_4_4() { define void @s_shuffle_v3i32_v4i32__3_4_4() { ; GFX900-LABEL: s_shuffle_v3i32_v4i32__3_4_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -11840,6 +11979,7 @@ define void @s_shuffle_v3i32_v4i32__3_4_4() { ; ; GFX90A-LABEL: s_shuffle_v3i32_v4i32__3_4_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -11852,6 +11992,7 @@ define void @s_shuffle_v3i32_v4i32__3_4_4() { ; ; GFX942-LABEL: s_shuffle_v3i32_v4i32__3_4_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -12031,6 +12172,7 @@ define void @s_shuffle_v3i32_v4i32__7_4_4() { define void @s_shuffle_v3i32_v4i32__7_u_4() { ; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_u_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -12044,6 +12186,7 @@ define void @s_shuffle_v3i32_v4i32__7_u_4() { ; ; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_u_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -12057,6 +12200,7 @@ define void @s_shuffle_v3i32_v4i32__7_u_4() { ; ; GFX942-LABEL: s_shuffle_v3i32_v4i32__7_u_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -12722,6 +12866,7 @@ define void @s_shuffle_v3i32_v4i32__7_5_5() { define void @s_shuffle_v3i32_v4i32__7_u_5() { ; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_u_5: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -12735,6 +12880,7 @@ define void @s_shuffle_v3i32_v4i32__7_u_5() { ; ; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_u_5: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -12748,6 +12894,7 @@ define void @s_shuffle_v3i32_v4i32__7_u_5() { ; ; GFX942-LABEL: s_shuffle_v3i32_v4i32__7_u_5: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -13720,6 +13867,7 @@ define void @s_shuffle_v3i32_v4i32__7_5_6() { define void @s_shuffle_v3i32_v4i32__u_7_7() { ; GFX900-LABEL: s_shuffle_v3i32_v4i32__u_7_7: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -13733,6 +13881,7 @@ define void @s_shuffle_v3i32_v4i32__u_7_7() { ; ; GFX90A-LABEL: s_shuffle_v3i32_v4i32__u_7_7: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -13746,6 +13895,7 @@ define void @s_shuffle_v3i32_v4i32__u_7_7() { ; ; GFX942-LABEL: s_shuffle_v3i32_v4i32__u_7_7: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -14113,6 +14263,7 @@ define void @s_shuffle_v3i32_v4i32__6_7_7() { define void @s_shuffle_v3i32_v4i32__7_u_7() { ; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_u_7: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -14126,6 +14277,7 @@ define void @s_shuffle_v3i32_v4i32__7_u_7() { ; ; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_u_7: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -14139,6 +14291,7 @@ define void @s_shuffle_v3i32_v4i32__7_u_7() { ; ; GFX942-LABEL: s_shuffle_v3i32_v4i32__7_u_7: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v2i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v2i64.ll index a15fc3212f474..c607bc7dc3960 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v2i64.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v2i64.ll @@ -57,40 +57,44 @@ define void @v_shuffle_v3i64_v2i64__0_u_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v3i64_v2i64__1_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v2i64__1_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v2i64__1_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v2i64__1_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -113,40 +117,44 @@ define void @v_shuffle_v3i64_v2i64__2_u_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v3i64_v2i64__3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v2i64__3_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v2i64__3_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v2i64__3_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -588,11 +596,12 @@ define void @v_shuffle_v3i64_v2i64__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -602,11 +611,12 @@ define void @v_shuffle_v3i64_v2i64__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -616,11 +626,12 @@ define void @v_shuffle_v3i64_v2i64__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -732,11 +743,12 @@ define void @v_shuffle_v3i64_v2i64__2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -746,11 +758,12 @@ define void @v_shuffle_v3i64_v2i64__2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -760,11 +773,12 @@ define void @v_shuffle_v3i64_v2i64__2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -841,14 +855,15 @@ define void @v_shuffle_v3i64_v2i64__3_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v2i64__3_u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v0, v4 ; GFX900-NEXT: v_mov_b32_e32 v1, v5 ; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] @@ -858,14 +873,15 @@ define void @v_shuffle_v3i64_v2i64__3_u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i64_v2i64__3_u_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: v_mov_b32_e32 v1, v5 ; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] @@ -875,15 +891,15 @@ define void @v_shuffle_v3i64_v2i64__3_u_0(ptr addrspace(1) inreg %ptr) { ; GFX942-LABEL: v_shuffle_v3i64_v2i64__3_u_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v0, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 ; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] @@ -1249,16 +1265,17 @@ define void @v_shuffle_v3i64_v2i64__3_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v2i64__3_u_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v0, v6 ; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 ; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -1266,16 +1283,17 @@ define void @v_shuffle_v3i64_v2i64__3_u_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i64_v2i64__3_u_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -1283,14 +1301,16 @@ define void @v_shuffle_v3i64_v2i64__3_u_1(ptr addrspace(1) inreg %ptr) { ; GFX942-LABEL: v_shuffle_v3i64_v2i64__3_u_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v7 ; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] @@ -1484,40 +1504,44 @@ define void @v_shuffle_v3i64_v2i64__0_2_2(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v3i64_v2i64__1_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v2i64__1_2_2: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v2i64__1_2_2: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v2i64__1_2_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -1589,43 +1613,46 @@ define void @v_shuffle_v3i64_v2i64__3_2_2(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v3i64_v2i64__3_u_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v2i64__3_u_2: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v2i64__3_u_2: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v2i64__3_u_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -1962,43 +1989,46 @@ define void @v_shuffle_v3i64_v2i64__2_3_3(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v3i64_v2i64__3_u_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v2i64__3_u_3: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3i64_v2i64__3_u_3: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3i64_v2i64__3_u_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -2236,6 +2266,7 @@ define void @s_shuffle_v3i64_v2i64__0_u_u() { define void @s_shuffle_v3i64_v2i64__1_u_u() { ; GFX900-LABEL: s_shuffle_v3i64_v2i64__1_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -2249,6 +2280,7 @@ define void @s_shuffle_v3i64_v2i64__1_u_u() { ; ; GFX90A-LABEL: s_shuffle_v3i64_v2i64__1_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -2262,6 +2294,7 @@ define void @s_shuffle_v3i64_v2i64__1_u_u() { ; ; GFX942-LABEL: s_shuffle_v3i64_v2i64__1_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -2295,6 +2328,7 @@ define void @s_shuffle_v3i64_v2i64__2_u_u() { define void @s_shuffle_v3i64_v2i64__3_u_u() { ; GFX900-LABEL: s_shuffle_v3i64_v2i64__3_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -2308,6 +2342,7 @@ define void @s_shuffle_v3i64_v2i64__3_u_u() { ; ; GFX90A-LABEL: s_shuffle_v3i64_v2i64__3_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -2321,6 +2356,7 @@ define void @s_shuffle_v3i64_v2i64__3_u_u() { ; ; GFX942-LABEL: s_shuffle_v3i64_v2i64__3_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -2343,13 +2379,14 @@ define void @s_shuffle_v3i64_v2i64__3_0_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ; def s[12:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 ; GFX900-NEXT: s_mov_b32 s10, s4 ; GFX900-NEXT: s_mov_b32 s11, s5 ; GFX900-NEXT: ;;#ASMSTART @@ -2361,13 +2398,14 @@ define void @s_shuffle_v3i64_v2i64__3_0_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ; def s[12:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 ; GFX90A-NEXT: s_mov_b32 s10, s4 ; GFX90A-NEXT: s_mov_b32 s11, s5 ; GFX90A-NEXT: ;;#ASMSTART @@ -2377,6 +2415,7 @@ define void @s_shuffle_v3i64_v2i64__3_0_u() { ; ; GFX942-LABEL: s_shuffle_v3i64_v2i64__3_0_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -2457,6 +2496,7 @@ define void @s_shuffle_v3i64_v2i64__3_1_u() { define void @s_shuffle_v3i64_v2i64__3_2_u() { ; GFX900-LABEL: s_shuffle_v3i64_v2i64__3_2_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -2472,6 +2512,7 @@ define void @s_shuffle_v3i64_v2i64__3_2_u() { ; ; GFX90A-LABEL: s_shuffle_v3i64_v2i64__3_2_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -2487,6 +2528,7 @@ define void @s_shuffle_v3i64_v2i64__3_2_u() { ; ; GFX942-LABEL: s_shuffle_v3i64_v2i64__3_2_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -2731,6 +2773,7 @@ define void @s_shuffle_v3i64_v2i64__3_3_3() { define void @s_shuffle_v3i64_v2i64__u_0_0() { ; GFX900-LABEL: s_shuffle_v3i64_v2i64__u_0_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -2746,6 +2789,7 @@ define void @s_shuffle_v3i64_v2i64__u_0_0() { ; ; GFX90A-LABEL: s_shuffle_v3i64_v2i64__u_0_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -2761,6 +2805,7 @@ define void @s_shuffle_v3i64_v2i64__u_0_0() { ; ; GFX942-LABEL: s_shuffle_v3i64_v2i64__u_0_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -2860,6 +2905,7 @@ define void @s_shuffle_v3i64_v2i64__1_0_0() { define void @s_shuffle_v3i64_v2i64__2_0_0() { ; GFX900-LABEL: s_shuffle_v3i64_v2i64__2_0_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -2875,6 +2921,7 @@ define void @s_shuffle_v3i64_v2i64__2_0_0() { ; ; GFX90A-LABEL: s_shuffle_v3i64_v2i64__2_0_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -2890,6 +2937,7 @@ define void @s_shuffle_v3i64_v2i64__2_0_0() { ; ; GFX942-LABEL: s_shuffle_v3i64_v2i64__2_0_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -2980,13 +3028,14 @@ define void @s_shuffle_v3i64_v2i64__3_u_0() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ; def s[12:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 ; GFX900-NEXT: s_mov_b32 s12, s4 ; GFX900-NEXT: s_mov_b32 s13, s5 ; GFX900-NEXT: ;;#ASMSTART @@ -2998,13 +3047,14 @@ define void @s_shuffle_v3i64_v2i64__3_u_0() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ; def s[12:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 ; GFX90A-NEXT: s_mov_b32 s12, s4 ; GFX90A-NEXT: s_mov_b32 s13, s5 ; GFX90A-NEXT: ;;#ASMSTART @@ -3014,6 +3064,7 @@ define void @s_shuffle_v3i64_v2i64__3_u_0() { ; ; GFX942-LABEL: s_shuffle_v3i64_v2i64__3_u_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -3314,13 +3365,14 @@ define void @s_shuffle_v3i64_v2i64__3_u_1() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ; def s[12:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 ; GFX900-NEXT: s_mov_b32 s12, s6 ; GFX900-NEXT: s_mov_b32 s13, s7 ; GFX900-NEXT: ;;#ASMSTART @@ -3332,13 +3384,14 @@ define void @s_shuffle_v3i64_v2i64__3_u_1() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ; def s[12:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 ; GFX90A-NEXT: s_mov_b32 s12, s6 ; GFX90A-NEXT: s_mov_b32 s13, s7 ; GFX90A-NEXT: ;;#ASMSTART @@ -3348,6 +3401,7 @@ define void @s_shuffle_v3i64_v2i64__3_u_1() { ; ; GFX942-LABEL: s_shuffle_v3i64_v2i64__3_u_1: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -3561,6 +3615,7 @@ define void @s_shuffle_v3i64_v2i64__0_2_2() { define void @s_shuffle_v3i64_v2i64__1_2_2() { ; GFX900-LABEL: s_shuffle_v3i64_v2i64__1_2_2: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -3574,6 +3629,7 @@ define void @s_shuffle_v3i64_v2i64__1_2_2() { ; ; GFX90A-LABEL: s_shuffle_v3i64_v2i64__1_2_2: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -3587,6 +3643,7 @@ define void @s_shuffle_v3i64_v2i64__1_2_2() { ; ; GFX942-LABEL: s_shuffle_v3i64_v2i64__1_2_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -3678,6 +3735,7 @@ define void @s_shuffle_v3i64_v2i64__3_2_2() { define void @s_shuffle_v3i64_v2i64__3_u_2() { ; GFX900-LABEL: s_shuffle_v3i64_v2i64__3_u_2: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -3693,6 +3751,7 @@ define void @s_shuffle_v3i64_v2i64__3_u_2() { ; ; GFX90A-LABEL: s_shuffle_v3i64_v2i64__3_u_2: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -3708,6 +3767,7 @@ define void @s_shuffle_v3i64_v2i64__3_u_2() { ; ; GFX942-LABEL: s_shuffle_v3i64_v2i64__3_u_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -4016,6 +4076,7 @@ define void @s_shuffle_v3i64_v2i64__2_3_3() { define void @s_shuffle_v3i64_v2i64__3_u_3() { ; GFX900-LABEL: s_shuffle_v3i64_v2i64__3_u_3: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -4031,6 +4092,7 @@ define void @s_shuffle_v3i64_v2i64__3_u_3() { ; ; GFX90A-LABEL: s_shuffle_v3i64_v2i64__3_u_3: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -4046,6 +4108,7 @@ define void @s_shuffle_v3i64_v2i64__3_u_3() { ; ; GFX942-LABEL: s_shuffle_v3i64_v2i64__3_u_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v3i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v3i64.ll index f15dd7d2772e5..30197dfd13c5c 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v3i64.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v3i64.ll @@ -103,6 +103,7 @@ define void @v_shuffle_v3i64_v3i64__2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v4 ; GFX900-NEXT: v_mov_b32_e32 v1, v5 @@ -116,6 +117,7 @@ define void @v_shuffle_v3i64_v3i64__2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: v_mov_b32_e32 v1, v5 @@ -129,6 +131,7 @@ define void @v_shuffle_v3i64_v3i64__2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 @@ -199,6 +202,7 @@ define void @v_shuffle_v3i64_v3i64__5_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v4 ; GFX900-NEXT: v_mov_b32_e32 v1, v5 @@ -212,6 +216,7 @@ define void @v_shuffle_v3i64_v3i64__5_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: v_mov_b32_e32 v1, v5 @@ -225,6 +230,7 @@ define void @v_shuffle_v3i64_v3i64__5_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 @@ -884,11 +890,12 @@ define void @v_shuffle_v3i64_v3i64__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 ; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -898,11 +905,12 @@ define void @v_shuffle_v3i64_v3i64__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 ; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -912,11 +920,12 @@ define void @v_shuffle_v3i64_v3i64__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 ; GFX942-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -1082,11 +1091,12 @@ define void @v_shuffle_v3i64_v3i64__3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 ; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1096,11 +1106,12 @@ define void @v_shuffle_v3i64_v3i64__3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 ; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1110,11 +1121,12 @@ define void @v_shuffle_v3i64_v3i64__3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 ; GFX942-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -1249,14 +1261,15 @@ define void @v_shuffle_v3i64_v3i64__5_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v0, v6 ; GFX900-NEXT: v_mov_b32_e32 v1, v7 ; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] @@ -1266,14 +1279,15 @@ define void @v_shuffle_v3i64_v3i64__5_u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_u_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] @@ -1283,15 +1297,15 @@ define void @v_shuffle_v3i64_v3i64__5_u_0(ptr addrspace(1) inreg %ptr) { ; GFX942-LABEL: v_shuffle_v3i64_v3i64__5_u_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v7 ; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] @@ -1880,16 +1894,17 @@ define void @v_shuffle_v3i64_v3i64__5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_u_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v0, v8 ; GFX900-NEXT: v_mov_b32_e32 v1, v9 -; GFX900-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 ; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -1897,16 +1912,17 @@ define void @v_shuffle_v3i64_v3i64__5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_u_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v0, v8 ; GFX90A-NEXT: v_mov_b32_e32 v1, v9 -; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -1914,14 +1930,16 @@ define void @v_shuffle_v3i64_v3i64__5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX942-LABEL: v_shuffle_v3i64_v3i64__5_u_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v8 ; GFX942-NEXT: v_mov_b32_e32 v1, v9 ; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] @@ -2524,6 +2542,7 @@ define void @v_shuffle_v3i64_v3i64__5_u_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[6:11] @@ -2541,6 +2560,7 @@ define void @v_shuffle_v3i64_v3i64__5_u_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:11] @@ -2558,6 +2578,7 @@ define void @v_shuffle_v3i64_v3i64__5_u_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[6:11] @@ -2918,6 +2939,7 @@ define void @v_shuffle_v3i64_v3i64__2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v4 ; GFX900-NEXT: v_mov_b32_e32 v1, v5 @@ -2931,6 +2953,7 @@ define void @v_shuffle_v3i64_v3i64__2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: v_mov_b32_e32 v1, v5 @@ -2944,6 +2967,7 @@ define void @v_shuffle_v3i64_v3i64__2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 @@ -3080,6 +3104,7 @@ define void @v_shuffle_v3i64_v3i64__5_u_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v0, v4 ; GFX900-NEXT: v_mov_b32_e32 v1, v5 ; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] @@ -3094,6 +3119,7 @@ define void @v_shuffle_v3i64_v3i64__5_u_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: v_mov_b32_e32 v1, v5 ; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] @@ -3108,6 +3134,7 @@ define void @v_shuffle_v3i64_v3i64__5_u_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v0, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 ; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] @@ -3711,13 +3738,14 @@ define void @v_shuffle_v3i64_v3i64__5_u_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_u_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v0, v4 ; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 ; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -3725,13 +3753,14 @@ define void @v_shuffle_v3i64_v3i64__5_u_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_u_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -3739,13 +3768,14 @@ define void @v_shuffle_v3i64_v3i64__5_u_4(ptr addrspace(1) inreg %ptr) { ; GFX942-LABEL: v_shuffle_v3i64_v3i64__5_u_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v0, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] offset:16 ; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -4307,6 +4337,7 @@ define void @v_shuffle_v3i64_v3i64__5_u_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v4 ; GFX900-NEXT: v_mov_b32_e32 v1, v5 @@ -4321,6 +4352,7 @@ define void @v_shuffle_v3i64_v3i64__5_u_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: v_mov_b32_e32 v1, v5 @@ -4335,6 +4367,7 @@ define void @v_shuffle_v3i64_v3i64__5_u_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 @@ -4694,6 +4727,7 @@ define void @s_shuffle_v3i64_v3i64__1_u_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX900-NEXT: s_mov_b32 s8, s6 ; GFX900-NEXT: s_mov_b32 s9, s7 ; GFX900-NEXT: ;;#ASMSTART @@ -4707,6 +4741,7 @@ define void @s_shuffle_v3i64_v3i64__1_u_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX90A-NEXT: s_mov_b32 s8, s6 ; GFX90A-NEXT: s_mov_b32 s9, s7 ; GFX90A-NEXT: ;;#ASMSTART @@ -4716,6 +4751,7 @@ define void @s_shuffle_v3i64_v3i64__1_u_u() { ; ; GFX942-LABEL: s_shuffle_v3i64_v3i64__1_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -4737,8 +4773,11 @@ define void @s_shuffle_v3i64_v3i64__2_u_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX900-NEXT: s_mov_b32 s8, s16 +; GFX900-NEXT: s_mov_b32 s9, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -4748,8 +4787,11 @@ define void @s_shuffle_v3i64_v3i64__2_u_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX90A-NEXT: s_mov_b32 s8, s16 +; GFX90A-NEXT: s_mov_b32 s9, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -4757,6 +4799,7 @@ define void @s_shuffle_v3i64_v3i64__2_u_u() { ; ; GFX942-LABEL: s_shuffle_v3i64_v3i64__2_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -4794,6 +4837,7 @@ define void @s_shuffle_v3i64_v3i64__4_u_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX900-NEXT: s_mov_b32 s8, s6 ; GFX900-NEXT: s_mov_b32 s9, s7 ; GFX900-NEXT: ;;#ASMSTART @@ -4807,6 +4851,7 @@ define void @s_shuffle_v3i64_v3i64__4_u_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX90A-NEXT: s_mov_b32 s8, s6 ; GFX90A-NEXT: s_mov_b32 s9, s7 ; GFX90A-NEXT: ;;#ASMSTART @@ -4816,6 +4861,7 @@ define void @s_shuffle_v3i64_v3i64__4_u_u() { ; ; GFX942-LABEL: s_shuffle_v3i64_v3i64__4_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -4838,8 +4884,11 @@ define void @s_shuffle_v3i64_v3i64__5_u_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX900-NEXT: s_mov_b32 s8, s16 +; GFX900-NEXT: s_mov_b32 s9, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -4849,8 +4898,11 @@ define void @s_shuffle_v3i64_v3i64__5_u_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX90A-NEXT: s_mov_b32 s8, s16 +; GFX90A-NEXT: s_mov_b32 s9, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -4858,6 +4910,7 @@ define void @s_shuffle_v3i64_v3i64__5_u_u() { ; ; GFX942-LABEL: s_shuffle_v3i64_v3i64__5_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -4883,10 +4936,11 @@ define void @s_shuffle_v3i64_v3i64__5_0_u() { ; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX900-NEXT: s_mov_b32 s8, s16 +; GFX900-NEXT: s_mov_b32 s9, s17 ; GFX900-NEXT: s_mov_b32 s10, s4 ; GFX900-NEXT: s_mov_b32 s11, s5 ; GFX900-NEXT: ;;#ASMSTART @@ -4901,10 +4955,11 @@ define void @s_shuffle_v3i64_v3i64__5_0_u() { ; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX90A-NEXT: s_mov_b32 s8, s16 +; GFX90A-NEXT: s_mov_b32 s9, s17 ; GFX90A-NEXT: s_mov_b32 s10, s4 ; GFX90A-NEXT: s_mov_b32 s11, s5 ; GFX90A-NEXT: ;;#ASMSTART @@ -4916,12 +4971,15 @@ define void @s_shuffle_v3i64_v3i64__5_0_u() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[12:17] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:9] +; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s16 +; GFX942-NEXT: s_mov_b32 s9, s17 +; GFX942-NEXT: s_mov_b32 s10, s0 ; GFX942-NEXT: s_mov_b32 s11, s1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] @@ -4990,13 +5048,16 @@ define void @s_shuffle_v3i64_v3i64__5_2_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[20:25] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s8, s24 +; GFX900-NEXT: s_mov_b32 s9, s25 +; GFX900-NEXT: s_mov_b32 s10, s16 +; GFX900-NEXT: s_mov_b32 s11, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -5006,13 +5067,16 @@ define void @s_shuffle_v3i64_v3i64__5_2_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[20:25] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s8, s24 +; GFX90A-NEXT: s_mov_b32 s9, s25 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s11, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -5022,13 +5086,14 @@ define void @s_shuffle_v3i64_v3i64__5_2_u() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] +; GFX942-NEXT: ; def s[12:17] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 +; GFX942-NEXT: s_mov_b32 s8, s16 +; GFX942-NEXT: s_mov_b32 s9, s17 ; GFX942-NEXT: s_mov_b32 s10, s4 ; GFX942-NEXT: s_mov_b32 s11, s5 ; GFX942-NEXT: ;;#ASMSTART @@ -5045,12 +5110,15 @@ define void @s_shuffle_v3i64_v3i64__5_2_u() { define void @s_shuffle_v3i64_v3i64__5_3_u() { ; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_3_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[16:21] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s8, s20 +; GFX900-NEXT: s_mov_b32 s9, s21 +; GFX900-NEXT: s_mov_b32 s10, s16 +; GFX900-NEXT: s_mov_b32 s11, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -5058,12 +5126,15 @@ define void @s_shuffle_v3i64_v3i64__5_3_u() { ; ; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_3_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[16:21] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s8, s20 +; GFX90A-NEXT: s_mov_b32 s9, s21 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s11, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -5071,6 +5142,7 @@ define void @s_shuffle_v3i64_v3i64__5_3_u() { ; ; GFX942-LABEL: s_shuffle_v3i64_v3i64__5_3_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -5115,12 +5187,13 @@ define void @s_shuffle_v3i64_v3i64__5_5_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX900-NEXT: s_mov_b32 s8, s16 +; GFX900-NEXT: s_mov_b32 s9, s17 +; GFX900-NEXT: s_mov_b32 s10, s16 +; GFX900-NEXT: s_mov_b32 s11, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -5130,12 +5203,13 @@ define void @s_shuffle_v3i64_v3i64__5_5_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX90A-NEXT: s_mov_b32 s8, s16 +; GFX90A-NEXT: s_mov_b32 s9, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s11, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -5143,6 +5217,7 @@ define void @s_shuffle_v3i64_v3i64__5_5_u() { ; ; GFX942-LABEL: s_shuffle_v3i64_v3i64__5_5_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -5498,6 +5573,7 @@ define void @s_shuffle_v3i64_v3i64__u_0_0() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX900-NEXT: s_mov_b32 s10, s4 ; GFX900-NEXT: s_mov_b32 s11, s5 ; GFX900-NEXT: s_mov_b32 s12, s4 @@ -5513,6 +5589,7 @@ define void @s_shuffle_v3i64_v3i64__u_0_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX90A-NEXT: s_mov_b32 s10, s4 ; GFX90A-NEXT: s_mov_b32 s11, s5 ; GFX90A-NEXT: s_mov_b32 s12, s4 @@ -5524,6 +5601,7 @@ define void @s_shuffle_v3i64_v3i64__u_0_0() { ; ; GFX942-LABEL: s_shuffle_v3i64_v3i64__u_0_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -5680,6 +5758,7 @@ define void @s_shuffle_v3i64_v3i64__3_0_0() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX900-NEXT: s_mov_b32 s10, s4 ; GFX900-NEXT: s_mov_b32 s11, s5 ; GFX900-NEXT: s_mov_b32 s12, s4 @@ -5695,6 +5774,7 @@ define void @s_shuffle_v3i64_v3i64__3_0_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX90A-NEXT: s_mov_b32 s10, s4 ; GFX90A-NEXT: s_mov_b32 s11, s5 ; GFX90A-NEXT: s_mov_b32 s12, s4 @@ -5706,6 +5786,7 @@ define void @s_shuffle_v3i64_v3i64__3_0_0() { ; ; GFX942-LABEL: s_shuffle_v3i64_v3i64__3_0_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -5864,10 +5945,11 @@ define void @s_shuffle_v3i64_v3i64__5_u_0() { ; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX900-NEXT: s_mov_b32 s8, s16 +; GFX900-NEXT: s_mov_b32 s9, s17 ; GFX900-NEXT: s_mov_b32 s12, s4 ; GFX900-NEXT: s_mov_b32 s13, s5 ; GFX900-NEXT: ;;#ASMSTART @@ -5882,10 +5964,11 @@ define void @s_shuffle_v3i64_v3i64__5_u_0() { ; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX90A-NEXT: s_mov_b32 s8, s16 +; GFX90A-NEXT: s_mov_b32 s9, s17 ; GFX90A-NEXT: s_mov_b32 s12, s4 ; GFX90A-NEXT: s_mov_b32 s13, s5 ; GFX90A-NEXT: ;;#ASMSTART @@ -5897,12 +5980,15 @@ define void @s_shuffle_v3i64_v3i64__5_u_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[12:17] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s0 +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:9] +; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s16 +; GFX942-NEXT: s_mov_b32 s9, s17 +; GFX942-NEXT: s_mov_b32 s12, s0 ; GFX942-NEXT: s_mov_b32 s13, s1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] @@ -6390,10 +6476,11 @@ define void @s_shuffle_v3i64_v3i64__5_u_1() { ; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX900-NEXT: s_mov_b32 s8, s16 +; GFX900-NEXT: s_mov_b32 s9, s17 ; GFX900-NEXT: s_mov_b32 s12, s6 ; GFX900-NEXT: s_mov_b32 s13, s7 ; GFX900-NEXT: ;;#ASMSTART @@ -6408,10 +6495,11 @@ define void @s_shuffle_v3i64_v3i64__5_u_1() { ; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX90A-NEXT: s_mov_b32 s8, s16 +; GFX90A-NEXT: s_mov_b32 s9, s17 ; GFX90A-NEXT: s_mov_b32 s12, s6 ; GFX90A-NEXT: s_mov_b32 s13, s7 ; GFX90A-NEXT: ;;#ASMSTART @@ -6423,12 +6511,15 @@ define void @s_shuffle_v3i64_v3i64__5_u_1() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[12:17] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s2 +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:9] +; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s16 +; GFX942-NEXT: s_mov_b32 s9, s17 +; GFX942-NEXT: s_mov_b32 s12, s2 ; GFX942-NEXT: s_mov_b32 s13, s3 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] @@ -7256,6 +7347,7 @@ define void @s_shuffle_v3i64_v3i64__1_3_3() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX900-NEXT: s_mov_b32 s8, s6 ; GFX900-NEXT: s_mov_b32 s9, s7 ; GFX900-NEXT: ;;#ASMSTART @@ -7269,6 +7361,7 @@ define void @s_shuffle_v3i64_v3i64__1_3_3() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX90A-NEXT: s_mov_b32 s8, s6 ; GFX90A-NEXT: s_mov_b32 s9, s7 ; GFX90A-NEXT: ;;#ASMSTART @@ -7278,6 +7371,7 @@ define void @s_shuffle_v3i64_v3i64__1_3_3() { ; ; GFX942-LABEL: s_shuffle_v3i64_v3i64__1_3_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -7299,8 +7393,11 @@ define void @s_shuffle_v3i64_v3i64__2_3_3() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX900-NEXT: s_mov_b32 s8, s16 +; GFX900-NEXT: s_mov_b32 s9, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -7310,8 +7407,11 @@ define void @s_shuffle_v3i64_v3i64__2_3_3() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX90A-NEXT: s_mov_b32 s8, s16 +; GFX90A-NEXT: s_mov_b32 s9, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -7319,6 +7419,7 @@ define void @s_shuffle_v3i64_v3i64__2_3_3() { ; ; GFX942-LABEL: s_shuffle_v3i64_v3i64__2_3_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -7464,12 +7565,15 @@ define void @s_shuffle_v3i64_v3i64__5_3_3() { define void @s_shuffle_v3i64_v3i64__5_u_3() { ; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_u_3: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[16:21] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s8, s20 +; GFX900-NEXT: s_mov_b32 s9, s21 +; GFX900-NEXT: s_mov_b32 s12, s16 +; GFX900-NEXT: s_mov_b32 s13, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -7477,12 +7581,15 @@ define void @s_shuffle_v3i64_v3i64__5_u_3() { ; ; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_u_3: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[16:21] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s8, s20 +; GFX90A-NEXT: s_mov_b32 s9, s21 +; GFX90A-NEXT: s_mov_b32 s12, s16 +; GFX90A-NEXT: s_mov_b32 s13, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -7490,6 +7597,7 @@ define void @s_shuffle_v3i64_v3i64__5_u_3() { ; ; GFX942-LABEL: s_shuffle_v3i64_v3i64__5_u_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -8014,10 +8122,13 @@ define void @s_shuffle_v3i64_v3i64__5_u_4() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX900-NEXT: s_mov_b32 s8, s16 +; GFX900-NEXT: s_mov_b32 s9, s17 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -8027,10 +8138,13 @@ define void @s_shuffle_v3i64_v3i64__5_u_4() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX90A-NEXT: s_mov_b32 s8, s16 +; GFX90A-NEXT: s_mov_b32 s9, s17 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -8038,6 +8152,7 @@ define void @s_shuffle_v3i64_v3i64__5_u_4() { ; ; GFX942-LABEL: s_shuffle_v3i64_v3i64__5_u_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v4i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v4i64.ll index 6e156d2d4a2f5..082ce443694a0 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v4i64.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v4i64.ll @@ -142,6 +142,7 @@ define void @v_shuffle_v3i64_v4i64__3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v6 ; GFX900-NEXT: v_mov_b32_e32 v1, v7 @@ -155,6 +156,7 @@ define void @v_shuffle_v3i64_v4i64__3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 @@ -168,6 +170,7 @@ define void @v_shuffle_v3i64_v4i64__3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v7 @@ -278,6 +281,7 @@ define void @v_shuffle_v3i64_v4i64__7_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v6 ; GFX900-NEXT: v_mov_b32_e32 v1, v7 @@ -291,6 +295,7 @@ define void @v_shuffle_v3i64_v4i64__7_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 @@ -304,6 +309,7 @@ define void @v_shuffle_v3i64_v4i64__7_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v7 @@ -1172,11 +1178,12 @@ define void @v_shuffle_v3i64_v4i64__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 ; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1186,11 +1193,12 @@ define void @v_shuffle_v3i64_v4i64__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1200,11 +1208,12 @@ define void @v_shuffle_v3i64_v4i64__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 ; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -1418,11 +1427,12 @@ define void @v_shuffle_v3i64_v4i64__4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 ; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1432,11 +1442,12 @@ define void @v_shuffle_v3i64_v4i64__4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1446,11 +1457,12 @@ define void @v_shuffle_v3i64_v4i64__4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 ; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -1643,14 +1655,15 @@ define void @v_shuffle_v3i64_v4i64__7_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v0, v8 ; GFX900-NEXT: v_mov_b32_e32 v1, v9 ; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] @@ -1660,14 +1673,15 @@ define void @v_shuffle_v3i64_v4i64__7_u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_u_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v0, v8 ; GFX90A-NEXT: v_mov_b32_e32 v1, v9 ; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] @@ -1677,15 +1691,15 @@ define void @v_shuffle_v3i64_v4i64__7_u_0(ptr addrspace(1) inreg %ptr) { ; GFX942-LABEL: v_shuffle_v3i64_v4i64__7_u_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v0, v8 ; GFX942-NEXT: v_mov_b32_e32 v1, v9 ; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] @@ -2496,16 +2510,17 @@ define void @v_shuffle_v3i64_v4i64__7_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_u_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v0, v10 ; GFX900-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 ; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -2513,16 +2528,17 @@ define void @v_shuffle_v3i64_v4i64__7_u_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_u_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v0, v10 ; GFX90A-NEXT: v_mov_b32_e32 v1, v11 -; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -2530,14 +2546,16 @@ define void @v_shuffle_v3i64_v4i64__7_u_1(ptr addrspace(1) inreg %ptr) { ; GFX942-LABEL: v_shuffle_v3i64_v4i64__7_u_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v10 ; GFX942-NEXT: v_mov_b32_e32 v1, v11 ; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] @@ -3363,6 +3381,7 @@ define void @v_shuffle_v3i64_v4i64__7_u_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[6:13] @@ -3380,6 +3399,7 @@ define void @v_shuffle_v3i64_v4i64__7_u_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:13] @@ -3397,6 +3417,7 @@ define void @v_shuffle_v3i64_v4i64__7_u_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[6:13] @@ -4225,6 +4246,7 @@ define void @v_shuffle_v3i64_v4i64__7_u_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[8:15] @@ -4242,6 +4264,7 @@ define void @v_shuffle_v3i64_v4i64__7_u_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[8:15] @@ -4259,6 +4282,7 @@ define void @v_shuffle_v3i64_v4i64__7_u_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[8:15] @@ -4774,6 +4798,7 @@ define void @v_shuffle_v3i64_v4i64__3_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v6 ; GFX900-NEXT: v_mov_b32_e32 v1, v7 @@ -4787,6 +4812,7 @@ define void @v_shuffle_v3i64_v4i64__3_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 @@ -4800,6 +4826,7 @@ define void @v_shuffle_v3i64_v4i64__3_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v7 @@ -4985,6 +5012,7 @@ define void @v_shuffle_v3i64_v4i64__7_u_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v0, v6 ; GFX900-NEXT: v_mov_b32_e32 v1, v7 ; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] @@ -4999,6 +5027,7 @@ define void @v_shuffle_v3i64_v4i64__7_u_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] @@ -5013,6 +5042,7 @@ define void @v_shuffle_v3i64_v4i64__7_u_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v7 ; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] @@ -5830,13 +5860,14 @@ define void @v_shuffle_v3i64_v4i64__7_u_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_u_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v0, v6 ; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 ; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -5844,13 +5875,14 @@ define void @v_shuffle_v3i64_v4i64__7_u_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_u_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -5858,13 +5890,14 @@ define void @v_shuffle_v3i64_v4i64__7_u_5(ptr addrspace(1) inreg %ptr) { ; GFX942-LABEL: v_shuffle_v3i64_v4i64__7_u_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 ; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -6689,6 +6722,7 @@ define void @v_shuffle_v3i64_v4i64__7_u_6(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v6 ; GFX900-NEXT: v_mov_b32_e32 v1, v7 @@ -6703,6 +6737,7 @@ define void @v_shuffle_v3i64_v4i64__7_u_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 @@ -6717,6 +6752,7 @@ define void @v_shuffle_v3i64_v4i64__7_u_6(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v7 @@ -7496,6 +7532,7 @@ define void @v_shuffle_v3i64_v4i64__7_u_7(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v6 ; GFX900-NEXT: v_mov_b32_e32 v1, v7 @@ -7510,6 +7547,7 @@ define void @v_shuffle_v3i64_v4i64__7_u_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 @@ -7524,6 +7562,7 @@ define void @v_shuffle_v3i64_v4i64__7_u_7(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v7 @@ -7990,6 +8029,7 @@ define void @s_shuffle_v3i64_v4i64__1_u_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX900-NEXT: s_mov_b32 s8, s6 ; GFX900-NEXT: s_mov_b32 s9, s7 ; GFX900-NEXT: ;;#ASMSTART @@ -8003,6 +8043,7 @@ define void @s_shuffle_v3i64_v4i64__1_u_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX90A-NEXT: s_mov_b32 s8, s6 ; GFX90A-NEXT: s_mov_b32 s9, s7 ; GFX90A-NEXT: ;;#ASMSTART @@ -8012,6 +8053,7 @@ define void @s_shuffle_v3i64_v4i64__1_u_u() { ; ; GFX942-LABEL: s_shuffle_v3i64_v4i64__1_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -8033,8 +8075,11 @@ define void @s_shuffle_v3i64_v4i64__2_u_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX900-NEXT: s_mov_b32 s8, s16 +; GFX900-NEXT: s_mov_b32 s9, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -8044,8 +8089,11 @@ define void @s_shuffle_v3i64_v4i64__2_u_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX90A-NEXT: s_mov_b32 s8, s16 +; GFX90A-NEXT: s_mov_b32 s9, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -8053,6 +8101,7 @@ define void @s_shuffle_v3i64_v4i64__2_u_u() { ; ; GFX942-LABEL: s_shuffle_v3i64_v4i64__2_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -8074,10 +8123,11 @@ define void @s_shuffle_v3i64_v4i64__3_u_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -8087,10 +8137,11 @@ define void @s_shuffle_v3i64_v4i64__3_u_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -8098,6 +8149,7 @@ define void @s_shuffle_v3i64_v4i64__3_u_u() { ; ; GFX942-LABEL: s_shuffle_v3i64_v4i64__3_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -8135,6 +8187,7 @@ define void @s_shuffle_v3i64_v4i64__5_u_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX900-NEXT: s_mov_b32 s8, s6 ; GFX900-NEXT: s_mov_b32 s9, s7 ; GFX900-NEXT: ;;#ASMSTART @@ -8148,6 +8201,7 @@ define void @s_shuffle_v3i64_v4i64__5_u_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX90A-NEXT: s_mov_b32 s8, s6 ; GFX90A-NEXT: s_mov_b32 s9, s7 ; GFX90A-NEXT: ;;#ASMSTART @@ -8157,6 +8211,7 @@ define void @s_shuffle_v3i64_v4i64__5_u_u() { ; ; GFX942-LABEL: s_shuffle_v3i64_v4i64__5_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -8179,8 +8234,11 @@ define void @s_shuffle_v3i64_v4i64__6_u_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX900-NEXT: s_mov_b32 s8, s16 +; GFX900-NEXT: s_mov_b32 s9, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -8190,8 +8248,11 @@ define void @s_shuffle_v3i64_v4i64__6_u_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX90A-NEXT: s_mov_b32 s8, s16 +; GFX90A-NEXT: s_mov_b32 s9, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -8199,6 +8260,7 @@ define void @s_shuffle_v3i64_v4i64__6_u_u() { ; ; GFX942-LABEL: s_shuffle_v3i64_v4i64__6_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -8221,10 +8283,11 @@ define void @s_shuffle_v3i64_v4i64__7_u_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -8234,10 +8297,11 @@ define void @s_shuffle_v3i64_v4i64__7_u_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -8245,6 +8309,7 @@ define void @s_shuffle_v3i64_v4i64__7_u_u() { ; ; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -8272,6 +8337,7 @@ define void @s_shuffle_v3i64_v4i64__7_0_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX900-NEXT: s_mov_b32 s8, s14 ; GFX900-NEXT: s_mov_b32 s9, s15 ; GFX900-NEXT: s_mov_b32 s10, s4 @@ -8290,6 +8356,7 @@ define void @s_shuffle_v3i64_v4i64__7_0_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX90A-NEXT: s_mov_b32 s8, s14 ; GFX90A-NEXT: s_mov_b32 s9, s15 ; GFX90A-NEXT: s_mov_b32 s10, s4 @@ -8303,14 +8370,14 @@ define void @s_shuffle_v3i64_v4i64__7_0_u() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] +; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: s_mov_b32 s8, s14 +; GFX942-NEXT: s_mov_b32 s9, s15 ; GFX942-NEXT: s_mov_b32 s10, s0 ; GFX942-NEXT: s_mov_b32 s11, s1 ; GFX942-NEXT: ;;#ASMSTART @@ -8384,15 +8451,16 @@ define void @s_shuffle_v3i64_v4i64__7_2_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s16 +; GFX900-NEXT: s_mov_b32 s11, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -8402,15 +8470,16 @@ define void @s_shuffle_v3i64_v4i64__7_2_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s11, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -8422,6 +8491,7 @@ define void @s_shuffle_v3i64_v4i64__7_2_u() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND @@ -8445,13 +8515,16 @@ define void @s_shuffle_v3i64_v4i64__7_3_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ; def s[16:23] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s18 -; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: s_mov_b32 s8, s22 +; GFX900-NEXT: s_mov_b32 s9, s23 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -8461,13 +8534,16 @@ define void @s_shuffle_v3i64_v4i64__7_3_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ; def s[16:23] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s18 -; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: s_mov_b32 s8, s22 +; GFX90A-NEXT: s_mov_b32 s9, s23 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -8479,6 +8555,7 @@ define void @s_shuffle_v3i64_v4i64__7_3_u() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND @@ -8500,14 +8577,15 @@ define void @s_shuffle_v3i64_v4i64__7_3_u() { define void @s_shuffle_v3i64_v4i64__7_4_u() { ; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_4_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[16:23] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s8, s22 +; GFX900-NEXT: s_mov_b32 s9, s23 +; GFX900-NEXT: s_mov_b32 s10, s16 +; GFX900-NEXT: s_mov_b32 s11, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -8515,14 +8593,15 @@ define void @s_shuffle_v3i64_v4i64__7_4_u() { ; ; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_4_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[16:23] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s8, s22 +; GFX90A-NEXT: s_mov_b32 s9, s23 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s11, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -8530,6 +8609,7 @@ define void @s_shuffle_v3i64_v4i64__7_4_u() { ; ; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_4_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -8574,12 +8654,13 @@ define void @s_shuffle_v3i64_v4i64__7_6_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: s_mov_b32 s10, s16 +; GFX900-NEXT: s_mov_b32 s11, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -8589,12 +8670,13 @@ define void @s_shuffle_v3i64_v4i64__7_6_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s11, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -8602,6 +8684,7 @@ define void @s_shuffle_v3i64_v4i64__7_6_u() { ; ; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_6_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -8626,10 +8709,13 @@ define void @s_shuffle_v3i64_v4i64__7_7_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -8639,10 +8725,13 @@ define void @s_shuffle_v3i64_v4i64__7_7_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -8650,6 +8739,7 @@ define void @s_shuffle_v3i64_v4i64__7_7_u() { ; ; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_7_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -9114,6 +9204,7 @@ define void @s_shuffle_v3i64_v4i64__u_0_0() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX900-NEXT: s_mov_b32 s10, s4 ; GFX900-NEXT: s_mov_b32 s11, s5 ; GFX900-NEXT: s_mov_b32 s12, s4 @@ -9129,6 +9220,7 @@ define void @s_shuffle_v3i64_v4i64__u_0_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX90A-NEXT: s_mov_b32 s10, s4 ; GFX90A-NEXT: s_mov_b32 s11, s5 ; GFX90A-NEXT: s_mov_b32 s12, s4 @@ -9140,6 +9232,7 @@ define void @s_shuffle_v3i64_v4i64__u_0_0() { ; ; GFX942-LABEL: s_shuffle_v3i64_v4i64__u_0_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -9353,6 +9446,7 @@ define void @s_shuffle_v3i64_v4i64__4_0_0() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX900-NEXT: s_mov_b32 s10, s4 ; GFX900-NEXT: s_mov_b32 s11, s5 ; GFX900-NEXT: s_mov_b32 s12, s4 @@ -9368,6 +9462,7 @@ define void @s_shuffle_v3i64_v4i64__4_0_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX90A-NEXT: s_mov_b32 s10, s4 ; GFX90A-NEXT: s_mov_b32 s11, s5 ; GFX90A-NEXT: s_mov_b32 s12, s4 @@ -9379,6 +9474,7 @@ define void @s_shuffle_v3i64_v4i64__4_0_0() { ; ; GFX942-LABEL: s_shuffle_v3i64_v4i64__4_0_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -9606,6 +9702,7 @@ define void @s_shuffle_v3i64_v4i64__7_u_0() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX900-NEXT: s_mov_b32 s8, s14 ; GFX900-NEXT: s_mov_b32 s9, s15 ; GFX900-NEXT: s_mov_b32 s12, s4 @@ -9624,6 +9721,7 @@ define void @s_shuffle_v3i64_v4i64__7_u_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX90A-NEXT: s_mov_b32 s8, s14 ; GFX90A-NEXT: s_mov_b32 s9, s15 ; GFX90A-NEXT: s_mov_b32 s12, s4 @@ -9637,14 +9735,15 @@ define void @s_shuffle_v3i64_v4i64__7_u_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s0 +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] +; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: s_mov_b32 s8, s14 +; GFX942-NEXT: s_mov_b32 s9, s15 +; GFX942-NEXT: s_mov_b32 s12, s0 ; GFX942-NEXT: s_mov_b32 s13, s1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] @@ -10358,6 +10457,7 @@ define void @s_shuffle_v3i64_v4i64__7_u_1() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX900-NEXT: s_mov_b32 s8, s14 ; GFX900-NEXT: s_mov_b32 s9, s15 ; GFX900-NEXT: s_mov_b32 s12, s6 @@ -10376,6 +10476,7 @@ define void @s_shuffle_v3i64_v4i64__7_u_1() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX90A-NEXT: s_mov_b32 s8, s14 ; GFX90A-NEXT: s_mov_b32 s9, s15 ; GFX90A-NEXT: s_mov_b32 s12, s6 @@ -10389,14 +10490,15 @@ define void @s_shuffle_v3i64_v4i64__7_u_1() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s2 +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] +; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: s_mov_b32 s8, s14 +; GFX942-NEXT: s_mov_b32 s9, s15 +; GFX942-NEXT: s_mov_b32 s12, s2 ; GFX942-NEXT: s_mov_b32 s13, s3 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] @@ -11528,6 +11630,7 @@ define void @s_shuffle_v3i64_v4i64__u_3_3() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX900-NEXT: s_mov_b32 s10, s14 ; GFX900-NEXT: s_mov_b32 s11, s15 ; GFX900-NEXT: s_mov_b32 s12, s14 @@ -11543,6 +11646,7 @@ define void @s_shuffle_v3i64_v4i64__u_3_3() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX90A-NEXT: s_mov_b32 s10, s14 ; GFX90A-NEXT: s_mov_b32 s11, s15 ; GFX90A-NEXT: s_mov_b32 s12, s14 @@ -11554,6 +11658,7 @@ define void @s_shuffle_v3i64_v4i64__u_3_3() { ; ; GFX942-LABEL: s_shuffle_v3i64_v4i64__u_3_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -11771,6 +11876,7 @@ define void @s_shuffle_v3i64_v4i64__4_3_3() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX900-NEXT: s_mov_b32 s10, s14 ; GFX900-NEXT: s_mov_b32 s11, s15 ; GFX900-NEXT: s_mov_b32 s12, s14 @@ -11786,6 +11892,7 @@ define void @s_shuffle_v3i64_v4i64__4_3_3() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX90A-NEXT: s_mov_b32 s10, s14 ; GFX90A-NEXT: s_mov_b32 s11, s15 ; GFX90A-NEXT: s_mov_b32 s12, s14 @@ -11797,6 +11904,7 @@ define void @s_shuffle_v3i64_v4i64__4_3_3() { ; ; GFX942-LABEL: s_shuffle_v3i64_v4i64__4_3_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -12017,15 +12125,16 @@ define void @s_shuffle_v3i64_v4i64__7_u_3() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ; def s[16:23] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s18 -; GFX900-NEXT: s_mov_b32 s9, s19 -; GFX900-NEXT: s_mov_b32 s12, s10 -; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s8, s22 +; GFX900-NEXT: s_mov_b32 s9, s23 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -12035,15 +12144,16 @@ define void @s_shuffle_v3i64_v4i64__7_u_3() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ; def s[16:23] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s18 -; GFX90A-NEXT: s_mov_b32 s9, s19 -; GFX90A-NEXT: s_mov_b32 s12, s10 -; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s8, s22 +; GFX90A-NEXT: s_mov_b32 s9, s23 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -12055,6 +12165,7 @@ define void @s_shuffle_v3i64_v4i64__7_u_3() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND @@ -12524,6 +12635,7 @@ define void @s_shuffle_v3i64_v4i64__1_4_4() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX900-NEXT: s_mov_b32 s8, s6 ; GFX900-NEXT: s_mov_b32 s9, s7 ; GFX900-NEXT: ;;#ASMSTART @@ -12537,6 +12649,7 @@ define void @s_shuffle_v3i64_v4i64__1_4_4() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX90A-NEXT: s_mov_b32 s8, s6 ; GFX90A-NEXT: s_mov_b32 s9, s7 ; GFX90A-NEXT: ;;#ASMSTART @@ -12546,6 +12659,7 @@ define void @s_shuffle_v3i64_v4i64__1_4_4() { ; ; GFX942-LABEL: s_shuffle_v3i64_v4i64__1_4_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -12567,8 +12681,11 @@ define void @s_shuffle_v3i64_v4i64__2_4_4() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX900-NEXT: s_mov_b32 s8, s16 +; GFX900-NEXT: s_mov_b32 s9, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -12578,8 +12695,11 @@ define void @s_shuffle_v3i64_v4i64__2_4_4() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX90A-NEXT: s_mov_b32 s8, s16 +; GFX90A-NEXT: s_mov_b32 s9, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -12587,6 +12707,7 @@ define void @s_shuffle_v3i64_v4i64__2_4_4() { ; ; GFX942-LABEL: s_shuffle_v3i64_v4i64__2_4_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -12608,10 +12729,11 @@ define void @s_shuffle_v3i64_v4i64__3_4_4() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -12621,10 +12743,11 @@ define void @s_shuffle_v3i64_v4i64__3_4_4() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -12632,6 +12755,7 @@ define void @s_shuffle_v3i64_v4i64__3_4_4() { ; ; GFX942-LABEL: s_shuffle_v3i64_v4i64__3_4_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -12835,14 +12959,15 @@ define void @s_shuffle_v3i64_v4i64__7_4_4() { define void @s_shuffle_v3i64_v4i64__7_u_4() { ; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_u_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[16:23] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s8, s22 +; GFX900-NEXT: s_mov_b32 s9, s23 +; GFX900-NEXT: s_mov_b32 s12, s16 +; GFX900-NEXT: s_mov_b32 s13, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -12850,14 +12975,15 @@ define void @s_shuffle_v3i64_v4i64__7_u_4() { ; ; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_u_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[16:23] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s8, s22 +; GFX90A-NEXT: s_mov_b32 s9, s23 +; GFX90A-NEXT: s_mov_b32 s12, s16 +; GFX90A-NEXT: s_mov_b32 s13, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -12865,6 +12991,7 @@ define void @s_shuffle_v3i64_v4i64__7_u_4() { ; ; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_u_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -13594,12 +13721,13 @@ define void @s_shuffle_v3i64_v4i64__7_u_5() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -13609,12 +13737,13 @@ define void @s_shuffle_v3i64_v4i64__7_u_5() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -13622,6 +13751,7 @@ define void @s_shuffle_v3i64_v4i64__7_u_5() { ; ; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_u_5: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -14715,6 +14845,7 @@ define void @s_shuffle_v3i64_v4i64__u_7_7() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX900-NEXT: s_mov_b32 s10, s14 ; GFX900-NEXT: s_mov_b32 s11, s15 ; GFX900-NEXT: s_mov_b32 s12, s14 @@ -14730,6 +14861,7 @@ define void @s_shuffle_v3i64_v4i64__u_7_7() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX90A-NEXT: s_mov_b32 s10, s14 ; GFX90A-NEXT: s_mov_b32 s11, s15 ; GFX90A-NEXT: s_mov_b32 s12, s14 @@ -14741,6 +14873,7 @@ define void @s_shuffle_v3i64_v4i64__u_7_7() { ; ; GFX942-LABEL: s_shuffle_v3i64_v4i64__u_7_7: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -15161,12 +15294,13 @@ define void @s_shuffle_v3i64_v4i64__7_u_7() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s12, s10 -; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -15176,12 +15310,13 @@ define void @s_shuffle_v3i64_v4i64__7_u_7() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s12, s10 -; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -15189,6 +15324,7 @@ define void @s_shuffle_v3i64_v4i64__7_u_7() { ; ; GFX942-LABEL: s_shuffle_v3i64_v4i64__7_u_7: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v2p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v2p0.ll index fe132493ce536..815a23f273f0d 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v2p0.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v2p0.ll @@ -57,40 +57,44 @@ define void @v_shuffle_v3p0_v2p0__0_u_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v3p0_v2p0__1_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v2p0__1_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v2p0__1_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v2p0__1_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -113,40 +117,44 @@ define void @v_shuffle_v3p0_v2p0__2_u_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v3p0_v2p0__3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v2p0__3_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v2p0__3_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v2p0__3_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -588,11 +596,12 @@ define void @v_shuffle_v3p0_v2p0__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -602,11 +611,12 @@ define void @v_shuffle_v3p0_v2p0__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -616,11 +626,12 @@ define void @v_shuffle_v3p0_v2p0__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -732,11 +743,12 @@ define void @v_shuffle_v3p0_v2p0__2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -746,11 +758,12 @@ define void @v_shuffle_v3p0_v2p0__2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -760,11 +773,12 @@ define void @v_shuffle_v3p0_v2p0__2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -841,14 +855,15 @@ define void @v_shuffle_v3p0_v2p0__3_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v2p0__3_u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v0, v4 ; GFX900-NEXT: v_mov_b32_e32 v1, v5 ; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] @@ -858,14 +873,15 @@ define void @v_shuffle_v3p0_v2p0__3_u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p0_v2p0__3_u_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: v_mov_b32_e32 v1, v5 ; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] @@ -875,15 +891,15 @@ define void @v_shuffle_v3p0_v2p0__3_u_0(ptr addrspace(1) inreg %ptr) { ; GFX942-LABEL: v_shuffle_v3p0_v2p0__3_u_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v0, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 ; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] @@ -1249,16 +1265,17 @@ define void @v_shuffle_v3p0_v2p0__3_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v2p0__3_u_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v0, v6 ; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 ; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -1266,16 +1283,17 @@ define void @v_shuffle_v3p0_v2p0__3_u_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p0_v2p0__3_u_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -1283,14 +1301,16 @@ define void @v_shuffle_v3p0_v2p0__3_u_1(ptr addrspace(1) inreg %ptr) { ; GFX942-LABEL: v_shuffle_v3p0_v2p0__3_u_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v7 ; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] @@ -1484,40 +1504,44 @@ define void @v_shuffle_v3p0_v2p0__0_2_2(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v3p0_v2p0__1_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v2p0__1_2_2: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v2p0__1_2_2: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v2p0__1_2_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -1589,43 +1613,46 @@ define void @v_shuffle_v3p0_v2p0__3_2_2(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v3p0_v2p0__3_u_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v2p0__3_u_2: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v2p0__3_u_2: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v2p0__3_u_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -1962,43 +1989,46 @@ define void @v_shuffle_v3p0_v2p0__2_3_3(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v3p0_v2p0__3_u_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v2p0__3_u_3: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p0_v2p0__3_u_3: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p0_v2p0__3_u_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -2236,6 +2266,7 @@ define void @s_shuffle_v3p0_v2p0__0_u_u() { define void @s_shuffle_v3p0_v2p0__1_u_u() { ; GFX900-LABEL: s_shuffle_v3p0_v2p0__1_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -2249,6 +2280,7 @@ define void @s_shuffle_v3p0_v2p0__1_u_u() { ; ; GFX90A-LABEL: s_shuffle_v3p0_v2p0__1_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -2262,6 +2294,7 @@ define void @s_shuffle_v3p0_v2p0__1_u_u() { ; ; GFX942-LABEL: s_shuffle_v3p0_v2p0__1_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -2295,6 +2328,7 @@ define void @s_shuffle_v3p0_v2p0__2_u_u() { define void @s_shuffle_v3p0_v2p0__3_u_u() { ; GFX900-LABEL: s_shuffle_v3p0_v2p0__3_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -2308,6 +2342,7 @@ define void @s_shuffle_v3p0_v2p0__3_u_u() { ; ; GFX90A-LABEL: s_shuffle_v3p0_v2p0__3_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -2321,6 +2356,7 @@ define void @s_shuffle_v3p0_v2p0__3_u_u() { ; ; GFX942-LABEL: s_shuffle_v3p0_v2p0__3_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -2343,13 +2379,14 @@ define void @s_shuffle_v3p0_v2p0__3_0_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ; def s[12:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 ; GFX900-NEXT: s_mov_b32 s10, s4 ; GFX900-NEXT: s_mov_b32 s11, s5 ; GFX900-NEXT: ;;#ASMSTART @@ -2361,13 +2398,14 @@ define void @s_shuffle_v3p0_v2p0__3_0_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ; def s[12:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 ; GFX90A-NEXT: s_mov_b32 s10, s4 ; GFX90A-NEXT: s_mov_b32 s11, s5 ; GFX90A-NEXT: ;;#ASMSTART @@ -2377,6 +2415,7 @@ define void @s_shuffle_v3p0_v2p0__3_0_u() { ; ; GFX942-LABEL: s_shuffle_v3p0_v2p0__3_0_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -2457,6 +2496,7 @@ define void @s_shuffle_v3p0_v2p0__3_1_u() { define void @s_shuffle_v3p0_v2p0__3_2_u() { ; GFX900-LABEL: s_shuffle_v3p0_v2p0__3_2_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -2472,6 +2512,7 @@ define void @s_shuffle_v3p0_v2p0__3_2_u() { ; ; GFX90A-LABEL: s_shuffle_v3p0_v2p0__3_2_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -2487,6 +2528,7 @@ define void @s_shuffle_v3p0_v2p0__3_2_u() { ; ; GFX942-LABEL: s_shuffle_v3p0_v2p0__3_2_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -2731,6 +2773,7 @@ define void @s_shuffle_v3p0_v2p0__3_3_3() { define void @s_shuffle_v3p0_v2p0__u_0_0() { ; GFX900-LABEL: s_shuffle_v3p0_v2p0__u_0_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -2746,6 +2789,7 @@ define void @s_shuffle_v3p0_v2p0__u_0_0() { ; ; GFX90A-LABEL: s_shuffle_v3p0_v2p0__u_0_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -2761,6 +2805,7 @@ define void @s_shuffle_v3p0_v2p0__u_0_0() { ; ; GFX942-LABEL: s_shuffle_v3p0_v2p0__u_0_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -2860,6 +2905,7 @@ define void @s_shuffle_v3p0_v2p0__1_0_0() { define void @s_shuffle_v3p0_v2p0__2_0_0() { ; GFX900-LABEL: s_shuffle_v3p0_v2p0__2_0_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -2875,6 +2921,7 @@ define void @s_shuffle_v3p0_v2p0__2_0_0() { ; ; GFX90A-LABEL: s_shuffle_v3p0_v2p0__2_0_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -2890,6 +2937,7 @@ define void @s_shuffle_v3p0_v2p0__2_0_0() { ; ; GFX942-LABEL: s_shuffle_v3p0_v2p0__2_0_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -2980,13 +3028,14 @@ define void @s_shuffle_v3p0_v2p0__3_u_0() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ; def s[12:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 ; GFX900-NEXT: s_mov_b32 s12, s4 ; GFX900-NEXT: s_mov_b32 s13, s5 ; GFX900-NEXT: ;;#ASMSTART @@ -2998,13 +3047,14 @@ define void @s_shuffle_v3p0_v2p0__3_u_0() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ; def s[12:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 ; GFX90A-NEXT: s_mov_b32 s12, s4 ; GFX90A-NEXT: s_mov_b32 s13, s5 ; GFX90A-NEXT: ;;#ASMSTART @@ -3014,6 +3064,7 @@ define void @s_shuffle_v3p0_v2p0__3_u_0() { ; ; GFX942-LABEL: s_shuffle_v3p0_v2p0__3_u_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -3314,13 +3365,14 @@ define void @s_shuffle_v3p0_v2p0__3_u_1() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ; def s[12:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 ; GFX900-NEXT: s_mov_b32 s12, s6 ; GFX900-NEXT: s_mov_b32 s13, s7 ; GFX900-NEXT: ;;#ASMSTART @@ -3332,13 +3384,14 @@ define void @s_shuffle_v3p0_v2p0__3_u_1() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ; def s[12:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 ; GFX90A-NEXT: s_mov_b32 s12, s6 ; GFX90A-NEXT: s_mov_b32 s13, s7 ; GFX90A-NEXT: ;;#ASMSTART @@ -3348,6 +3401,7 @@ define void @s_shuffle_v3p0_v2p0__3_u_1() { ; ; GFX942-LABEL: s_shuffle_v3p0_v2p0__3_u_1: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -3561,6 +3615,7 @@ define void @s_shuffle_v3p0_v2p0__0_2_2() { define void @s_shuffle_v3p0_v2p0__1_2_2() { ; GFX900-LABEL: s_shuffle_v3p0_v2p0__1_2_2: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -3574,6 +3629,7 @@ define void @s_shuffle_v3p0_v2p0__1_2_2() { ; ; GFX90A-LABEL: s_shuffle_v3p0_v2p0__1_2_2: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -3587,6 +3643,7 @@ define void @s_shuffle_v3p0_v2p0__1_2_2() { ; ; GFX942-LABEL: s_shuffle_v3p0_v2p0__1_2_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -3678,6 +3735,7 @@ define void @s_shuffle_v3p0_v2p0__3_2_2() { define void @s_shuffle_v3p0_v2p0__3_u_2() { ; GFX900-LABEL: s_shuffle_v3p0_v2p0__3_u_2: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -3693,6 +3751,7 @@ define void @s_shuffle_v3p0_v2p0__3_u_2() { ; ; GFX90A-LABEL: s_shuffle_v3p0_v2p0__3_u_2: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -3708,6 +3767,7 @@ define void @s_shuffle_v3p0_v2p0__3_u_2() { ; ; GFX942-LABEL: s_shuffle_v3p0_v2p0__3_u_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -4016,6 +4076,7 @@ define void @s_shuffle_v3p0_v2p0__2_3_3() { define void @s_shuffle_v3p0_v2p0__3_u_3() { ; GFX900-LABEL: s_shuffle_v3p0_v2p0__3_u_3: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -4031,6 +4092,7 @@ define void @s_shuffle_v3p0_v2p0__3_u_3() { ; ; GFX90A-LABEL: s_shuffle_v3p0_v2p0__3_u_3: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -4046,6 +4108,7 @@ define void @s_shuffle_v3p0_v2p0__3_u_3() { ; ; GFX942-LABEL: s_shuffle_v3p0_v2p0__3_u_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v3p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v3p0.ll index b6f4e3091b61f..56cd051df3e55 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v3p0.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v3p0.ll @@ -103,6 +103,7 @@ define void @v_shuffle_v3p0_v3p0__2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v4 ; GFX900-NEXT: v_mov_b32_e32 v1, v5 @@ -116,6 +117,7 @@ define void @v_shuffle_v3p0_v3p0__2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: v_mov_b32_e32 v1, v5 @@ -129,6 +131,7 @@ define void @v_shuffle_v3p0_v3p0__2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 @@ -199,6 +202,7 @@ define void @v_shuffle_v3p0_v3p0__5_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v4 ; GFX900-NEXT: v_mov_b32_e32 v1, v5 @@ -212,6 +216,7 @@ define void @v_shuffle_v3p0_v3p0__5_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: v_mov_b32_e32 v1, v5 @@ -225,6 +230,7 @@ define void @v_shuffle_v3p0_v3p0__5_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 @@ -884,11 +890,12 @@ define void @v_shuffle_v3p0_v3p0__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 ; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -898,11 +905,12 @@ define void @v_shuffle_v3p0_v3p0__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 ; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -912,11 +920,12 @@ define void @v_shuffle_v3p0_v3p0__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 ; GFX942-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -1082,11 +1091,12 @@ define void @v_shuffle_v3p0_v3p0__3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 ; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1096,11 +1106,12 @@ define void @v_shuffle_v3p0_v3p0__3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 ; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1110,11 +1121,12 @@ define void @v_shuffle_v3p0_v3p0__3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 ; GFX942-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -1249,14 +1261,15 @@ define void @v_shuffle_v3p0_v3p0__5_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v0, v6 ; GFX900-NEXT: v_mov_b32_e32 v1, v7 ; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] @@ -1266,14 +1279,15 @@ define void @v_shuffle_v3p0_v3p0__5_u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_u_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] @@ -1283,15 +1297,15 @@ define void @v_shuffle_v3p0_v3p0__5_u_0(ptr addrspace(1) inreg %ptr) { ; GFX942-LABEL: v_shuffle_v3p0_v3p0__5_u_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v7 ; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] @@ -1880,16 +1894,17 @@ define void @v_shuffle_v3p0_v3p0__5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_u_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v0, v8 ; GFX900-NEXT: v_mov_b32_e32 v1, v9 -; GFX900-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 ; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -1897,16 +1912,17 @@ define void @v_shuffle_v3p0_v3p0__5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_u_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v0, v8 ; GFX90A-NEXT: v_mov_b32_e32 v1, v9 -; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -1914,14 +1930,16 @@ define void @v_shuffle_v3p0_v3p0__5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX942-LABEL: v_shuffle_v3p0_v3p0__5_u_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v8 ; GFX942-NEXT: v_mov_b32_e32 v1, v9 ; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] @@ -2524,6 +2542,7 @@ define void @v_shuffle_v3p0_v3p0__5_u_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[6:11] @@ -2541,6 +2560,7 @@ define void @v_shuffle_v3p0_v3p0__5_u_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:11] @@ -2558,6 +2578,7 @@ define void @v_shuffle_v3p0_v3p0__5_u_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[6:11] @@ -2918,6 +2939,7 @@ define void @v_shuffle_v3p0_v3p0__2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v4 ; GFX900-NEXT: v_mov_b32_e32 v1, v5 @@ -2931,6 +2953,7 @@ define void @v_shuffle_v3p0_v3p0__2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: v_mov_b32_e32 v1, v5 @@ -2944,6 +2967,7 @@ define void @v_shuffle_v3p0_v3p0__2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 @@ -3080,6 +3104,7 @@ define void @v_shuffle_v3p0_v3p0__5_u_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v0, v4 ; GFX900-NEXT: v_mov_b32_e32 v1, v5 ; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] @@ -3094,6 +3119,7 @@ define void @v_shuffle_v3p0_v3p0__5_u_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: v_mov_b32_e32 v1, v5 ; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] @@ -3108,6 +3134,7 @@ define void @v_shuffle_v3p0_v3p0__5_u_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v0, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 ; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] @@ -3711,13 +3738,14 @@ define void @v_shuffle_v3p0_v3p0__5_u_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_u_4: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v0, v4 ; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 ; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -3725,13 +3753,14 @@ define void @v_shuffle_v3p0_v3p0__5_u_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_u_4: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -3739,13 +3768,14 @@ define void @v_shuffle_v3p0_v3p0__5_u_4(ptr addrspace(1) inreg %ptr) { ; GFX942-LABEL: v_shuffle_v3p0_v3p0__5_u_4: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v0, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] offset:16 ; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -4307,6 +4337,7 @@ define void @v_shuffle_v3p0_v3p0__5_u_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v4 ; GFX900-NEXT: v_mov_b32_e32 v1, v5 @@ -4321,6 +4352,7 @@ define void @v_shuffle_v3p0_v3p0__5_u_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: v_mov_b32_e32 v1, v5 @@ -4335,6 +4367,7 @@ define void @v_shuffle_v3p0_v3p0__5_u_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 @@ -4694,6 +4727,7 @@ define void @s_shuffle_v3p0_v3p0__1_u_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX900-NEXT: s_mov_b32 s8, s6 ; GFX900-NEXT: s_mov_b32 s9, s7 ; GFX900-NEXT: ;;#ASMSTART @@ -4707,6 +4741,7 @@ define void @s_shuffle_v3p0_v3p0__1_u_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX90A-NEXT: s_mov_b32 s8, s6 ; GFX90A-NEXT: s_mov_b32 s9, s7 ; GFX90A-NEXT: ;;#ASMSTART @@ -4716,6 +4751,7 @@ define void @s_shuffle_v3p0_v3p0__1_u_u() { ; ; GFX942-LABEL: s_shuffle_v3p0_v3p0__1_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -4737,8 +4773,11 @@ define void @s_shuffle_v3p0_v3p0__2_u_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX900-NEXT: s_mov_b32 s8, s16 +; GFX900-NEXT: s_mov_b32 s9, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -4748,8 +4787,11 @@ define void @s_shuffle_v3p0_v3p0__2_u_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX90A-NEXT: s_mov_b32 s8, s16 +; GFX90A-NEXT: s_mov_b32 s9, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -4757,6 +4799,7 @@ define void @s_shuffle_v3p0_v3p0__2_u_u() { ; ; GFX942-LABEL: s_shuffle_v3p0_v3p0__2_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -4794,6 +4837,7 @@ define void @s_shuffle_v3p0_v3p0__4_u_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX900-NEXT: s_mov_b32 s8, s6 ; GFX900-NEXT: s_mov_b32 s9, s7 ; GFX900-NEXT: ;;#ASMSTART @@ -4807,6 +4851,7 @@ define void @s_shuffle_v3p0_v3p0__4_u_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX90A-NEXT: s_mov_b32 s8, s6 ; GFX90A-NEXT: s_mov_b32 s9, s7 ; GFX90A-NEXT: ;;#ASMSTART @@ -4816,6 +4861,7 @@ define void @s_shuffle_v3p0_v3p0__4_u_u() { ; ; GFX942-LABEL: s_shuffle_v3p0_v3p0__4_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -4838,8 +4884,11 @@ define void @s_shuffle_v3p0_v3p0__5_u_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX900-NEXT: s_mov_b32 s8, s16 +; GFX900-NEXT: s_mov_b32 s9, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -4849,8 +4898,11 @@ define void @s_shuffle_v3p0_v3p0__5_u_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX90A-NEXT: s_mov_b32 s8, s16 +; GFX90A-NEXT: s_mov_b32 s9, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -4858,6 +4910,7 @@ define void @s_shuffle_v3p0_v3p0__5_u_u() { ; ; GFX942-LABEL: s_shuffle_v3p0_v3p0__5_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -4883,10 +4936,11 @@ define void @s_shuffle_v3p0_v3p0__5_0_u() { ; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX900-NEXT: s_mov_b32 s8, s16 +; GFX900-NEXT: s_mov_b32 s9, s17 ; GFX900-NEXT: s_mov_b32 s10, s4 ; GFX900-NEXT: s_mov_b32 s11, s5 ; GFX900-NEXT: ;;#ASMSTART @@ -4901,10 +4955,11 @@ define void @s_shuffle_v3p0_v3p0__5_0_u() { ; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX90A-NEXT: s_mov_b32 s8, s16 +; GFX90A-NEXT: s_mov_b32 s9, s17 ; GFX90A-NEXT: s_mov_b32 s10, s4 ; GFX90A-NEXT: s_mov_b32 s11, s5 ; GFX90A-NEXT: ;;#ASMSTART @@ -4916,12 +4971,15 @@ define void @s_shuffle_v3p0_v3p0__5_0_u() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[12:17] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:9] +; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s16 +; GFX942-NEXT: s_mov_b32 s9, s17 +; GFX942-NEXT: s_mov_b32 s10, s0 ; GFX942-NEXT: s_mov_b32 s11, s1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] @@ -4990,13 +5048,16 @@ define void @s_shuffle_v3p0_v3p0__5_2_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[20:25] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s8, s24 +; GFX900-NEXT: s_mov_b32 s9, s25 +; GFX900-NEXT: s_mov_b32 s10, s16 +; GFX900-NEXT: s_mov_b32 s11, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -5006,13 +5067,16 @@ define void @s_shuffle_v3p0_v3p0__5_2_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[20:25] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s8, s24 +; GFX90A-NEXT: s_mov_b32 s9, s25 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s11, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -5022,13 +5086,14 @@ define void @s_shuffle_v3p0_v3p0__5_2_u() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] +; GFX942-NEXT: ; def s[12:17] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 +; GFX942-NEXT: s_mov_b32 s8, s16 +; GFX942-NEXT: s_mov_b32 s9, s17 ; GFX942-NEXT: s_mov_b32 s10, s4 ; GFX942-NEXT: s_mov_b32 s11, s5 ; GFX942-NEXT: ;;#ASMSTART @@ -5045,12 +5110,15 @@ define void @s_shuffle_v3p0_v3p0__5_2_u() { define void @s_shuffle_v3p0_v3p0__5_3_u() { ; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_3_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[16:21] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s8, s20 +; GFX900-NEXT: s_mov_b32 s9, s21 +; GFX900-NEXT: s_mov_b32 s10, s16 +; GFX900-NEXT: s_mov_b32 s11, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -5058,12 +5126,15 @@ define void @s_shuffle_v3p0_v3p0__5_3_u() { ; ; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_3_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[16:21] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s8, s20 +; GFX90A-NEXT: s_mov_b32 s9, s21 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s11, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -5071,6 +5142,7 @@ define void @s_shuffle_v3p0_v3p0__5_3_u() { ; ; GFX942-LABEL: s_shuffle_v3p0_v3p0__5_3_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -5115,12 +5187,13 @@ define void @s_shuffle_v3p0_v3p0__5_5_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX900-NEXT: s_mov_b32 s8, s16 +; GFX900-NEXT: s_mov_b32 s9, s17 +; GFX900-NEXT: s_mov_b32 s10, s16 +; GFX900-NEXT: s_mov_b32 s11, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -5130,12 +5203,13 @@ define void @s_shuffle_v3p0_v3p0__5_5_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX90A-NEXT: s_mov_b32 s8, s16 +; GFX90A-NEXT: s_mov_b32 s9, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s11, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -5143,6 +5217,7 @@ define void @s_shuffle_v3p0_v3p0__5_5_u() { ; ; GFX942-LABEL: s_shuffle_v3p0_v3p0__5_5_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -5498,6 +5573,7 @@ define void @s_shuffle_v3p0_v3p0__u_0_0() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX900-NEXT: s_mov_b32 s10, s4 ; GFX900-NEXT: s_mov_b32 s11, s5 ; GFX900-NEXT: s_mov_b32 s12, s4 @@ -5513,6 +5589,7 @@ define void @s_shuffle_v3p0_v3p0__u_0_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX90A-NEXT: s_mov_b32 s10, s4 ; GFX90A-NEXT: s_mov_b32 s11, s5 ; GFX90A-NEXT: s_mov_b32 s12, s4 @@ -5524,6 +5601,7 @@ define void @s_shuffle_v3p0_v3p0__u_0_0() { ; ; GFX942-LABEL: s_shuffle_v3p0_v3p0__u_0_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -5680,6 +5758,7 @@ define void @s_shuffle_v3p0_v3p0__3_0_0() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX900-NEXT: s_mov_b32 s10, s4 ; GFX900-NEXT: s_mov_b32 s11, s5 ; GFX900-NEXT: s_mov_b32 s12, s4 @@ -5695,6 +5774,7 @@ define void @s_shuffle_v3p0_v3p0__3_0_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX90A-NEXT: s_mov_b32 s10, s4 ; GFX90A-NEXT: s_mov_b32 s11, s5 ; GFX90A-NEXT: s_mov_b32 s12, s4 @@ -5706,6 +5786,7 @@ define void @s_shuffle_v3p0_v3p0__3_0_0() { ; ; GFX942-LABEL: s_shuffle_v3p0_v3p0__3_0_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -5864,10 +5945,11 @@ define void @s_shuffle_v3p0_v3p0__5_u_0() { ; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX900-NEXT: s_mov_b32 s8, s16 +; GFX900-NEXT: s_mov_b32 s9, s17 ; GFX900-NEXT: s_mov_b32 s12, s4 ; GFX900-NEXT: s_mov_b32 s13, s5 ; GFX900-NEXT: ;;#ASMSTART @@ -5882,10 +5964,11 @@ define void @s_shuffle_v3p0_v3p0__5_u_0() { ; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX90A-NEXT: s_mov_b32 s8, s16 +; GFX90A-NEXT: s_mov_b32 s9, s17 ; GFX90A-NEXT: s_mov_b32 s12, s4 ; GFX90A-NEXT: s_mov_b32 s13, s5 ; GFX90A-NEXT: ;;#ASMSTART @@ -5897,12 +5980,15 @@ define void @s_shuffle_v3p0_v3p0__5_u_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[12:17] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s0 +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:9] +; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s16 +; GFX942-NEXT: s_mov_b32 s9, s17 +; GFX942-NEXT: s_mov_b32 s12, s0 ; GFX942-NEXT: s_mov_b32 s13, s1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] @@ -6390,10 +6476,11 @@ define void @s_shuffle_v3p0_v3p0__5_u_1() { ; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX900-NEXT: s_mov_b32 s8, s16 +; GFX900-NEXT: s_mov_b32 s9, s17 ; GFX900-NEXT: s_mov_b32 s12, s6 ; GFX900-NEXT: s_mov_b32 s13, s7 ; GFX900-NEXT: ;;#ASMSTART @@ -6408,10 +6495,11 @@ define void @s_shuffle_v3p0_v3p0__5_u_1() { ; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX90A-NEXT: s_mov_b32 s8, s16 +; GFX90A-NEXT: s_mov_b32 s9, s17 ; GFX90A-NEXT: s_mov_b32 s12, s6 ; GFX90A-NEXT: s_mov_b32 s13, s7 ; GFX90A-NEXT: ;;#ASMSTART @@ -6423,12 +6511,15 @@ define void @s_shuffle_v3p0_v3p0__5_u_1() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[12:17] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s2 +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:9] +; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s16 +; GFX942-NEXT: s_mov_b32 s9, s17 +; GFX942-NEXT: s_mov_b32 s12, s2 ; GFX942-NEXT: s_mov_b32 s13, s3 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] @@ -7256,6 +7347,7 @@ define void @s_shuffle_v3p0_v3p0__1_3_3() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX900-NEXT: s_mov_b32 s8, s6 ; GFX900-NEXT: s_mov_b32 s9, s7 ; GFX900-NEXT: ;;#ASMSTART @@ -7269,6 +7361,7 @@ define void @s_shuffle_v3p0_v3p0__1_3_3() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX90A-NEXT: s_mov_b32 s8, s6 ; GFX90A-NEXT: s_mov_b32 s9, s7 ; GFX90A-NEXT: ;;#ASMSTART @@ -7278,6 +7371,7 @@ define void @s_shuffle_v3p0_v3p0__1_3_3() { ; ; GFX942-LABEL: s_shuffle_v3p0_v3p0__1_3_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -7299,8 +7393,11 @@ define void @s_shuffle_v3p0_v3p0__2_3_3() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX900-NEXT: s_mov_b32 s8, s16 +; GFX900-NEXT: s_mov_b32 s9, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -7310,8 +7407,11 @@ define void @s_shuffle_v3p0_v3p0__2_3_3() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX90A-NEXT: s_mov_b32 s8, s16 +; GFX90A-NEXT: s_mov_b32 s9, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -7319,6 +7419,7 @@ define void @s_shuffle_v3p0_v3p0__2_3_3() { ; ; GFX942-LABEL: s_shuffle_v3p0_v3p0__2_3_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -7464,12 +7565,15 @@ define void @s_shuffle_v3p0_v3p0__5_3_3() { define void @s_shuffle_v3p0_v3p0__5_u_3() { ; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_u_3: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[16:21] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s8, s20 +; GFX900-NEXT: s_mov_b32 s9, s21 +; GFX900-NEXT: s_mov_b32 s12, s16 +; GFX900-NEXT: s_mov_b32 s13, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -7477,12 +7581,15 @@ define void @s_shuffle_v3p0_v3p0__5_u_3() { ; ; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_u_3: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[16:21] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s8, s20 +; GFX90A-NEXT: s_mov_b32 s9, s21 +; GFX90A-NEXT: s_mov_b32 s12, s16 +; GFX90A-NEXT: s_mov_b32 s13, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -7490,6 +7597,7 @@ define void @s_shuffle_v3p0_v3p0__5_u_3() { ; ; GFX942-LABEL: s_shuffle_v3p0_v3p0__5_u_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -8014,10 +8122,13 @@ define void @s_shuffle_v3p0_v3p0__5_u_4() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX900-NEXT: s_mov_b32 s8, s16 +; GFX900-NEXT: s_mov_b32 s9, s17 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -8027,10 +8138,13 @@ define void @s_shuffle_v3p0_v3p0__5_u_4() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX90A-NEXT: s_mov_b32 s8, s16 +; GFX90A-NEXT: s_mov_b32 s9, s17 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -8038,6 +8152,7 @@ define void @s_shuffle_v3p0_v3p0__5_u_4() { ; ; GFX942-LABEL: s_shuffle_v3p0_v3p0__5_u_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v4p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v4p0.ll index b03066e66cf66..d979f76e404ad 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v4p0.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v4p0.ll @@ -142,6 +142,7 @@ define void @v_shuffle_v3p0_v4p0__3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v6 ; GFX900-NEXT: v_mov_b32_e32 v1, v7 @@ -155,6 +156,7 @@ define void @v_shuffle_v3p0_v4p0__3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 @@ -168,6 +170,7 @@ define void @v_shuffle_v3p0_v4p0__3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v7 @@ -278,6 +281,7 @@ define void @v_shuffle_v3p0_v4p0__7_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v6 ; GFX900-NEXT: v_mov_b32_e32 v1, v7 @@ -291,6 +295,7 @@ define void @v_shuffle_v3p0_v4p0__7_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 @@ -304,6 +309,7 @@ define void @v_shuffle_v3p0_v4p0__7_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v7 @@ -1172,11 +1178,12 @@ define void @v_shuffle_v3p0_v4p0__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 ; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1186,11 +1193,12 @@ define void @v_shuffle_v3p0_v4p0__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1200,11 +1208,12 @@ define void @v_shuffle_v3p0_v4p0__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 ; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -1418,11 +1427,12 @@ define void @v_shuffle_v3p0_v4p0__4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 ; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1432,11 +1442,12 @@ define void @v_shuffle_v3p0_v4p0__4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1446,11 +1457,12 @@ define void @v_shuffle_v3p0_v4p0__4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 ; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -1643,14 +1655,15 @@ define void @v_shuffle_v3p0_v4p0__7_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_u_0: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v0, v8 ; GFX900-NEXT: v_mov_b32_e32 v1, v9 ; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] @@ -1660,14 +1673,15 @@ define void @v_shuffle_v3p0_v4p0__7_u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_u_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v0, v8 ; GFX90A-NEXT: v_mov_b32_e32 v1, v9 ; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] @@ -1677,15 +1691,15 @@ define void @v_shuffle_v3p0_v4p0__7_u_0(ptr addrspace(1) inreg %ptr) { ; GFX942-LABEL: v_shuffle_v3p0_v4p0__7_u_0: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v0, v8 ; GFX942-NEXT: v_mov_b32_e32 v1, v9 ; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] @@ -2496,16 +2510,17 @@ define void @v_shuffle_v3p0_v4p0__7_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_u_1: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v0, v10 ; GFX900-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 ; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -2513,16 +2528,17 @@ define void @v_shuffle_v3p0_v4p0__7_u_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_u_1: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v0, v10 ; GFX90A-NEXT: v_mov_b32_e32 v1, v11 -; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -2530,14 +2546,16 @@ define void @v_shuffle_v3p0_v4p0__7_u_1(ptr addrspace(1) inreg %ptr) { ; GFX942-LABEL: v_shuffle_v3p0_v4p0__7_u_1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v10 ; GFX942-NEXT: v_mov_b32_e32 v1, v11 ; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] @@ -3363,6 +3381,7 @@ define void @v_shuffle_v3p0_v4p0__7_u_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[6:13] @@ -3380,6 +3399,7 @@ define void @v_shuffle_v3p0_v4p0__7_u_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:13] @@ -3397,6 +3417,7 @@ define void @v_shuffle_v3p0_v4p0__7_u_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[6:13] @@ -4225,6 +4246,7 @@ define void @v_shuffle_v3p0_v4p0__7_u_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[8:15] @@ -4242,6 +4264,7 @@ define void @v_shuffle_v3p0_v4p0__7_u_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[8:15] @@ -4259,6 +4282,7 @@ define void @v_shuffle_v3p0_v4p0__7_u_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[8:15] @@ -4774,6 +4798,7 @@ define void @v_shuffle_v3p0_v4p0__3_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v6 ; GFX900-NEXT: v_mov_b32_e32 v1, v7 @@ -4787,6 +4812,7 @@ define void @v_shuffle_v3p0_v4p0__3_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 @@ -4800,6 +4826,7 @@ define void @v_shuffle_v3p0_v4p0__3_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v7 @@ -4985,6 +5012,7 @@ define void @v_shuffle_v3p0_v4p0__7_u_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v0, v6 ; GFX900-NEXT: v_mov_b32_e32 v1, v7 ; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] @@ -4999,6 +5027,7 @@ define void @v_shuffle_v3p0_v4p0__7_u_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] @@ -5013,6 +5042,7 @@ define void @v_shuffle_v3p0_v4p0__7_u_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v7 ; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] @@ -5830,13 +5860,14 @@ define void @v_shuffle_v3p0_v4p0__7_u_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_u_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v0, v6 ; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 ; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -5844,13 +5875,14 @@ define void @v_shuffle_v3p0_v4p0__7_u_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_u_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -5858,13 +5890,14 @@ define void @v_shuffle_v3p0_v4p0__7_u_5(ptr addrspace(1) inreg %ptr) { ; GFX942-LABEL: v_shuffle_v3p0_v4p0__7_u_5: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 ; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -6689,6 +6722,7 @@ define void @v_shuffle_v3p0_v4p0__7_u_6(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v6 ; GFX900-NEXT: v_mov_b32_e32 v1, v7 @@ -6703,6 +6737,7 @@ define void @v_shuffle_v3p0_v4p0__7_u_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 @@ -6717,6 +6752,7 @@ define void @v_shuffle_v3p0_v4p0__7_u_6(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v7 @@ -7496,6 +7532,7 @@ define void @v_shuffle_v3p0_v4p0__7_u_7(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v6 ; GFX900-NEXT: v_mov_b32_e32 v1, v7 @@ -7510,6 +7547,7 @@ define void @v_shuffle_v3p0_v4p0__7_u_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 @@ -7524,6 +7562,7 @@ define void @v_shuffle_v3p0_v4p0__7_u_7(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v7 @@ -7990,6 +8029,7 @@ define void @s_shuffle_v3p0_v4p0__1_u_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX900-NEXT: s_mov_b32 s8, s6 ; GFX900-NEXT: s_mov_b32 s9, s7 ; GFX900-NEXT: ;;#ASMSTART @@ -8003,6 +8043,7 @@ define void @s_shuffle_v3p0_v4p0__1_u_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX90A-NEXT: s_mov_b32 s8, s6 ; GFX90A-NEXT: s_mov_b32 s9, s7 ; GFX90A-NEXT: ;;#ASMSTART @@ -8012,6 +8053,7 @@ define void @s_shuffle_v3p0_v4p0__1_u_u() { ; ; GFX942-LABEL: s_shuffle_v3p0_v4p0__1_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -8033,8 +8075,11 @@ define void @s_shuffle_v3p0_v4p0__2_u_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX900-NEXT: s_mov_b32 s8, s16 +; GFX900-NEXT: s_mov_b32 s9, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -8044,8 +8089,11 @@ define void @s_shuffle_v3p0_v4p0__2_u_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX90A-NEXT: s_mov_b32 s8, s16 +; GFX90A-NEXT: s_mov_b32 s9, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -8053,6 +8101,7 @@ define void @s_shuffle_v3p0_v4p0__2_u_u() { ; ; GFX942-LABEL: s_shuffle_v3p0_v4p0__2_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -8074,10 +8123,11 @@ define void @s_shuffle_v3p0_v4p0__3_u_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -8087,10 +8137,11 @@ define void @s_shuffle_v3p0_v4p0__3_u_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -8098,6 +8149,7 @@ define void @s_shuffle_v3p0_v4p0__3_u_u() { ; ; GFX942-LABEL: s_shuffle_v3p0_v4p0__3_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -8135,6 +8187,7 @@ define void @s_shuffle_v3p0_v4p0__5_u_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX900-NEXT: s_mov_b32 s8, s6 ; GFX900-NEXT: s_mov_b32 s9, s7 ; GFX900-NEXT: ;;#ASMSTART @@ -8148,6 +8201,7 @@ define void @s_shuffle_v3p0_v4p0__5_u_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX90A-NEXT: s_mov_b32 s8, s6 ; GFX90A-NEXT: s_mov_b32 s9, s7 ; GFX90A-NEXT: ;;#ASMSTART @@ -8157,6 +8211,7 @@ define void @s_shuffle_v3p0_v4p0__5_u_u() { ; ; GFX942-LABEL: s_shuffle_v3p0_v4p0__5_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -8179,8 +8234,11 @@ define void @s_shuffle_v3p0_v4p0__6_u_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX900-NEXT: s_mov_b32 s8, s16 +; GFX900-NEXT: s_mov_b32 s9, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -8190,8 +8248,11 @@ define void @s_shuffle_v3p0_v4p0__6_u_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX90A-NEXT: s_mov_b32 s8, s16 +; GFX90A-NEXT: s_mov_b32 s9, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -8199,6 +8260,7 @@ define void @s_shuffle_v3p0_v4p0__6_u_u() { ; ; GFX942-LABEL: s_shuffle_v3p0_v4p0__6_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -8221,10 +8283,11 @@ define void @s_shuffle_v3p0_v4p0__7_u_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -8234,10 +8297,11 @@ define void @s_shuffle_v3p0_v4p0__7_u_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -8245,6 +8309,7 @@ define void @s_shuffle_v3p0_v4p0__7_u_u() { ; ; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -8272,6 +8337,7 @@ define void @s_shuffle_v3p0_v4p0__7_0_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX900-NEXT: s_mov_b32 s8, s14 ; GFX900-NEXT: s_mov_b32 s9, s15 ; GFX900-NEXT: s_mov_b32 s10, s4 @@ -8290,6 +8356,7 @@ define void @s_shuffle_v3p0_v4p0__7_0_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX90A-NEXT: s_mov_b32 s8, s14 ; GFX90A-NEXT: s_mov_b32 s9, s15 ; GFX90A-NEXT: s_mov_b32 s10, s4 @@ -8303,14 +8370,14 @@ define void @s_shuffle_v3p0_v4p0__7_0_u() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] +; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: s_mov_b32 s8, s14 +; GFX942-NEXT: s_mov_b32 s9, s15 ; GFX942-NEXT: s_mov_b32 s10, s0 ; GFX942-NEXT: s_mov_b32 s11, s1 ; GFX942-NEXT: ;;#ASMSTART @@ -8384,15 +8451,16 @@ define void @s_shuffle_v3p0_v4p0__7_2_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s16 +; GFX900-NEXT: s_mov_b32 s11, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -8402,15 +8470,16 @@ define void @s_shuffle_v3p0_v4p0__7_2_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s11, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -8422,6 +8491,7 @@ define void @s_shuffle_v3p0_v4p0__7_2_u() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND @@ -8445,13 +8515,16 @@ define void @s_shuffle_v3p0_v4p0__7_3_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ; def s[16:23] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s18 -; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: s_mov_b32 s8, s22 +; GFX900-NEXT: s_mov_b32 s9, s23 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -8461,13 +8534,16 @@ define void @s_shuffle_v3p0_v4p0__7_3_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ; def s[16:23] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s18 -; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: s_mov_b32 s8, s22 +; GFX90A-NEXT: s_mov_b32 s9, s23 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -8479,6 +8555,7 @@ define void @s_shuffle_v3p0_v4p0__7_3_u() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND @@ -8500,14 +8577,15 @@ define void @s_shuffle_v3p0_v4p0__7_3_u() { define void @s_shuffle_v3p0_v4p0__7_4_u() { ; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_4_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[16:23] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s8, s22 +; GFX900-NEXT: s_mov_b32 s9, s23 +; GFX900-NEXT: s_mov_b32 s10, s16 +; GFX900-NEXT: s_mov_b32 s11, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -8515,14 +8593,15 @@ define void @s_shuffle_v3p0_v4p0__7_4_u() { ; ; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_4_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[16:23] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s8, s22 +; GFX90A-NEXT: s_mov_b32 s9, s23 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s11, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -8530,6 +8609,7 @@ define void @s_shuffle_v3p0_v4p0__7_4_u() { ; ; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_4_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -8574,12 +8654,13 @@ define void @s_shuffle_v3p0_v4p0__7_6_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: s_mov_b32 s10, s16 +; GFX900-NEXT: s_mov_b32 s11, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -8589,12 +8670,13 @@ define void @s_shuffle_v3p0_v4p0__7_6_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s11, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -8602,6 +8684,7 @@ define void @s_shuffle_v3p0_v4p0__7_6_u() { ; ; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_6_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -8626,10 +8709,13 @@ define void @s_shuffle_v3p0_v4p0__7_7_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -8639,10 +8725,13 @@ define void @s_shuffle_v3p0_v4p0__7_7_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -8650,6 +8739,7 @@ define void @s_shuffle_v3p0_v4p0__7_7_u() { ; ; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_7_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -9114,6 +9204,7 @@ define void @s_shuffle_v3p0_v4p0__u_0_0() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX900-NEXT: s_mov_b32 s10, s4 ; GFX900-NEXT: s_mov_b32 s11, s5 ; GFX900-NEXT: s_mov_b32 s12, s4 @@ -9129,6 +9220,7 @@ define void @s_shuffle_v3p0_v4p0__u_0_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX90A-NEXT: s_mov_b32 s10, s4 ; GFX90A-NEXT: s_mov_b32 s11, s5 ; GFX90A-NEXT: s_mov_b32 s12, s4 @@ -9140,6 +9232,7 @@ define void @s_shuffle_v3p0_v4p0__u_0_0() { ; ; GFX942-LABEL: s_shuffle_v3p0_v4p0__u_0_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -9353,6 +9446,7 @@ define void @s_shuffle_v3p0_v4p0__4_0_0() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX900-NEXT: s_mov_b32 s10, s4 ; GFX900-NEXT: s_mov_b32 s11, s5 ; GFX900-NEXT: s_mov_b32 s12, s4 @@ -9368,6 +9462,7 @@ define void @s_shuffle_v3p0_v4p0__4_0_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX90A-NEXT: s_mov_b32 s10, s4 ; GFX90A-NEXT: s_mov_b32 s11, s5 ; GFX90A-NEXT: s_mov_b32 s12, s4 @@ -9379,6 +9474,7 @@ define void @s_shuffle_v3p0_v4p0__4_0_0() { ; ; GFX942-LABEL: s_shuffle_v3p0_v4p0__4_0_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -9606,6 +9702,7 @@ define void @s_shuffle_v3p0_v4p0__7_u_0() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX900-NEXT: s_mov_b32 s8, s14 ; GFX900-NEXT: s_mov_b32 s9, s15 ; GFX900-NEXT: s_mov_b32 s12, s4 @@ -9624,6 +9721,7 @@ define void @s_shuffle_v3p0_v4p0__7_u_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX90A-NEXT: s_mov_b32 s8, s14 ; GFX90A-NEXT: s_mov_b32 s9, s15 ; GFX90A-NEXT: s_mov_b32 s12, s4 @@ -9637,14 +9735,15 @@ define void @s_shuffle_v3p0_v4p0__7_u_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s0 +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] +; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: s_mov_b32 s8, s14 +; GFX942-NEXT: s_mov_b32 s9, s15 +; GFX942-NEXT: s_mov_b32 s12, s0 ; GFX942-NEXT: s_mov_b32 s13, s1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] @@ -10358,6 +10457,7 @@ define void @s_shuffle_v3p0_v4p0__7_u_1() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX900-NEXT: s_mov_b32 s8, s14 ; GFX900-NEXT: s_mov_b32 s9, s15 ; GFX900-NEXT: s_mov_b32 s12, s6 @@ -10376,6 +10476,7 @@ define void @s_shuffle_v3p0_v4p0__7_u_1() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX90A-NEXT: s_mov_b32 s8, s14 ; GFX90A-NEXT: s_mov_b32 s9, s15 ; GFX90A-NEXT: s_mov_b32 s12, s6 @@ -10389,14 +10490,15 @@ define void @s_shuffle_v3p0_v4p0__7_u_1() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s2 +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] +; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: s_mov_b32 s8, s14 +; GFX942-NEXT: s_mov_b32 s9, s15 +; GFX942-NEXT: s_mov_b32 s12, s2 ; GFX942-NEXT: s_mov_b32 s13, s3 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:13] @@ -11528,6 +11630,7 @@ define void @s_shuffle_v3p0_v4p0__u_3_3() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX900-NEXT: s_mov_b32 s10, s14 ; GFX900-NEXT: s_mov_b32 s11, s15 ; GFX900-NEXT: s_mov_b32 s12, s14 @@ -11543,6 +11646,7 @@ define void @s_shuffle_v3p0_v4p0__u_3_3() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX90A-NEXT: s_mov_b32 s10, s14 ; GFX90A-NEXT: s_mov_b32 s11, s15 ; GFX90A-NEXT: s_mov_b32 s12, s14 @@ -11554,6 +11658,7 @@ define void @s_shuffle_v3p0_v4p0__u_3_3() { ; ; GFX942-LABEL: s_shuffle_v3p0_v4p0__u_3_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -11771,6 +11876,7 @@ define void @s_shuffle_v3p0_v4p0__4_3_3() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX900-NEXT: s_mov_b32 s10, s14 ; GFX900-NEXT: s_mov_b32 s11, s15 ; GFX900-NEXT: s_mov_b32 s12, s14 @@ -11786,6 +11892,7 @@ define void @s_shuffle_v3p0_v4p0__4_3_3() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX90A-NEXT: s_mov_b32 s10, s14 ; GFX90A-NEXT: s_mov_b32 s11, s15 ; GFX90A-NEXT: s_mov_b32 s12, s14 @@ -11797,6 +11904,7 @@ define void @s_shuffle_v3p0_v4p0__4_3_3() { ; ; GFX942-LABEL: s_shuffle_v3p0_v4p0__4_3_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -12017,15 +12125,16 @@ define void @s_shuffle_v3p0_v4p0__7_u_3() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ; def s[16:23] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s18 -; GFX900-NEXT: s_mov_b32 s9, s19 -; GFX900-NEXT: s_mov_b32 s12, s10 -; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s8, s22 +; GFX900-NEXT: s_mov_b32 s9, s23 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -12035,15 +12144,16 @@ define void @s_shuffle_v3p0_v4p0__7_u_3() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ; def s[16:23] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s18 -; GFX90A-NEXT: s_mov_b32 s9, s19 -; GFX90A-NEXT: s_mov_b32 s12, s10 -; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s8, s22 +; GFX90A-NEXT: s_mov_b32 s9, s23 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -12055,6 +12165,7 @@ define void @s_shuffle_v3p0_v4p0__7_u_3() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND @@ -12524,6 +12635,7 @@ define void @s_shuffle_v3p0_v4p0__1_4_4() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX900-NEXT: s_mov_b32 s8, s6 ; GFX900-NEXT: s_mov_b32 s9, s7 ; GFX900-NEXT: ;;#ASMSTART @@ -12537,6 +12649,7 @@ define void @s_shuffle_v3p0_v4p0__1_4_4() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX90A-NEXT: s_mov_b32 s8, s6 ; GFX90A-NEXT: s_mov_b32 s9, s7 ; GFX90A-NEXT: ;;#ASMSTART @@ -12546,6 +12659,7 @@ define void @s_shuffle_v3p0_v4p0__1_4_4() { ; ; GFX942-LABEL: s_shuffle_v3p0_v4p0__1_4_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -12567,8 +12681,11 @@ define void @s_shuffle_v3p0_v4p0__2_4_4() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX900-NEXT: s_mov_b32 s8, s16 +; GFX900-NEXT: s_mov_b32 s9, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -12578,8 +12695,11 @@ define void @s_shuffle_v3p0_v4p0__2_4_4() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX90A-NEXT: s_mov_b32 s8, s16 +; GFX90A-NEXT: s_mov_b32 s9, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -12587,6 +12707,7 @@ define void @s_shuffle_v3p0_v4p0__2_4_4() { ; ; GFX942-LABEL: s_shuffle_v3p0_v4p0__2_4_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -12608,10 +12729,11 @@ define void @s_shuffle_v3p0_v4p0__3_4_4() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -12621,10 +12743,11 @@ define void @s_shuffle_v3p0_v4p0__3_4_4() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -12632,6 +12755,7 @@ define void @s_shuffle_v3p0_v4p0__3_4_4() { ; ; GFX942-LABEL: s_shuffle_v3p0_v4p0__3_4_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -12835,14 +12959,15 @@ define void @s_shuffle_v3p0_v4p0__7_4_4() { define void @s_shuffle_v3p0_v4p0__7_u_4() { ; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_u_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[16:23] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s8, s22 +; GFX900-NEXT: s_mov_b32 s9, s23 +; GFX900-NEXT: s_mov_b32 s12, s16 +; GFX900-NEXT: s_mov_b32 s13, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -12850,14 +12975,15 @@ define void @s_shuffle_v3p0_v4p0__7_u_4() { ; ; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_u_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[16:23] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s8, s22 +; GFX90A-NEXT: s_mov_b32 s9, s23 +; GFX90A-NEXT: s_mov_b32 s12, s16 +; GFX90A-NEXT: s_mov_b32 s13, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -12865,6 +12991,7 @@ define void @s_shuffle_v3p0_v4p0__7_u_4() { ; ; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_u_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -13594,12 +13721,13 @@ define void @s_shuffle_v3p0_v4p0__7_u_5() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -13609,12 +13737,13 @@ define void @s_shuffle_v3p0_v4p0__7_u_5() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -13622,6 +13751,7 @@ define void @s_shuffle_v3p0_v4p0__7_u_5() { ; ; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_u_5: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -14715,6 +14845,7 @@ define void @s_shuffle_v3p0_v4p0__u_7_7() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX900-NEXT: s_mov_b32 s10, s14 ; GFX900-NEXT: s_mov_b32 s11, s15 ; GFX900-NEXT: s_mov_b32 s12, s14 @@ -14730,6 +14861,7 @@ define void @s_shuffle_v3p0_v4p0__u_7_7() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX90A-NEXT: s_mov_b32 s10, s14 ; GFX90A-NEXT: s_mov_b32 s11, s15 ; GFX90A-NEXT: s_mov_b32 s12, s14 @@ -14741,6 +14873,7 @@ define void @s_shuffle_v3p0_v4p0__u_7_7() { ; ; GFX942-LABEL: s_shuffle_v3p0_v4p0__u_7_7: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -15161,12 +15294,13 @@ define void @s_shuffle_v3p0_v4p0__7_u_7() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[8:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s12, s10 -; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:13] ; GFX900-NEXT: ;;#ASMEND @@ -15176,12 +15310,13 @@ define void @s_shuffle_v3p0_v4p0__7_u_7() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s12, s10 -; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:13] ; GFX90A-NEXT: ;;#ASMEND @@ -15189,6 +15324,7 @@ define void @s_shuffle_v3p0_v4p0__7_u_7() { ; ; GFX942-LABEL: s_shuffle_v3p0_v4p0__7_u_7: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v2p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v2p3.ll index bd0100a4ffdb5..5ef6b0f8b057e 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v2p3.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v2p3.ll @@ -59,11 +59,12 @@ define void @v_shuffle_v3p3_v2p3__1_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -71,11 +72,12 @@ define void @v_shuffle_v3p3_v2p3__1_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -83,11 +85,12 @@ define void @v_shuffle_v3p3_v2p3__1_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx3 v2, v[0:2], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -112,11 +115,12 @@ define void @v_shuffle_v3p3_v2p3__3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -124,11 +128,12 @@ define void @v_shuffle_v3p3_v2p3__3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -136,11 +141,12 @@ define void @v_shuffle_v3p3_v2p3__3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx3 v2, v[0:2], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -170,15 +176,16 @@ define void @v_shuffle_v3p3_v2p3__3_0_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -186,15 +193,16 @@ define void @v_shuffle_v3p3_v2p3__3_0_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[4:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -272,28 +280,30 @@ define void @v_shuffle_v3p3_v2p3__3_2_u(ptr addrspace(1) inreg %ptr) { ; ; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_2_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v2p3__3_2_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[4:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -560,26 +570,29 @@ define void @v_shuffle_v3p3_v2p3__u_0_0(ptr addrspace(1) inreg %ptr) { ; ; GFX90A-LABEL: v_shuffle_v3p3_v2p3__u_0_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v2p3__u_0_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[4:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -695,26 +708,29 @@ define void @v_shuffle_v3p3_v2p3__2_0_0(ptr addrspace(1) inreg %ptr) { ; ; GFX90A-LABEL: v_shuffle_v3p3_v2p3__2_0_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v2p3__2_0_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[4:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -784,15 +800,16 @@ define void @v_shuffle_v3p3_v2p3__3_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[4:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -800,14 +817,16 @@ define void @v_shuffle_v3p3_v2p3__3_u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -815,14 +834,16 @@ define void @v_shuffle_v3p3_v2p3__3_u_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[4:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1192,15 +1213,16 @@ define void @v_shuffle_v3p3_v2p3__3_u_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1208,15 +1230,16 @@ define void @v_shuffle_v3p3_v2p3__3_u_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[4:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1394,11 +1417,12 @@ define void @v_shuffle_v3p3_v2p3__1_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[2:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1406,11 +1430,12 @@ define void @v_shuffle_v3p3_v2p3__1_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1418,11 +1443,12 @@ define void @v_shuffle_v3p3_v2p3__1_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx3 v2, v[0:2], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1492,39 +1518,44 @@ define void @v_shuffle_v3p3_v2p3__3_2_2(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v3p3_v2p3__3_u_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_u_2: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_u_2: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: global_store_dwordx3 v1, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v2p3__3_u_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v1, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[4:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: global_store_dwordx3 v1, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1857,12 +1888,13 @@ define void @v_shuffle_v3p3_v2p3__3_u_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1870,12 +1902,13 @@ define void @v_shuffle_v3p3_v2p3__3_u_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[2:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2101,6 +2134,7 @@ define void @s_shuffle_v3p3_v2p3__0_u_u() { define void @s_shuffle_v3p3_v2p3__1_u_u() { ; GFX900-LABEL: s_shuffle_v3p3_v2p3__1_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -2113,6 +2147,7 @@ define void @s_shuffle_v3p3_v2p3__1_u_u() { ; ; GFX90A-LABEL: s_shuffle_v3p3_v2p3__1_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -2125,6 +2160,7 @@ define void @s_shuffle_v3p3_v2p3__1_u_u() { ; ; GFX942-LABEL: s_shuffle_v3p3_v2p3__1_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -2157,6 +2193,7 @@ define void @s_shuffle_v3p3_v2p3__2_u_u() { define void @s_shuffle_v3p3_v2p3__3_u_u() { ; GFX900-LABEL: s_shuffle_v3p3_v2p3__3_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -2169,6 +2206,7 @@ define void @s_shuffle_v3p3_v2p3__3_u_u() { ; ; GFX90A-LABEL: s_shuffle_v3p3_v2p3__3_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -2181,6 +2219,7 @@ define void @s_shuffle_v3p3_v2p3__3_u_u() { ; ; GFX942-LABEL: s_shuffle_v3p3_v2p3__3_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -2200,6 +2239,7 @@ define void @s_shuffle_v3p3_v2p3__3_u_u() { define void @s_shuffle_v3p3_v2p3__3_0_u() { ; GFX900-LABEL: s_shuffle_v3p3_v2p3__3_0_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -2216,6 +2256,7 @@ define void @s_shuffle_v3p3_v2p3__3_0_u() { ; ; GFX90A-LABEL: s_shuffle_v3p3_v2p3__3_0_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -2232,6 +2273,7 @@ define void @s_shuffle_v3p3_v2p3__3_0_u() { ; ; GFX942-LABEL: s_shuffle_v3p3_v2p3__3_0_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -2307,6 +2349,7 @@ define void @s_shuffle_v3p3_v2p3__3_1_u() { define void @s_shuffle_v3p3_v2p3__3_2_u() { ; GFX900-LABEL: s_shuffle_v3p3_v2p3__3_2_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -2320,6 +2363,7 @@ define void @s_shuffle_v3p3_v2p3__3_2_u() { ; ; GFX90A-LABEL: s_shuffle_v3p3_v2p3__3_2_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -2333,6 +2377,7 @@ define void @s_shuffle_v3p3_v2p3__3_2_u() { ; ; GFX942-LABEL: s_shuffle_v3p3_v2p3__3_2_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -2551,6 +2596,7 @@ define void @s_shuffle_v3p3_v2p3__3_3_3() { define void @s_shuffle_v3p3_v2p3__u_0_0() { ; GFX900-LABEL: s_shuffle_v3p3_v2p3__u_0_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -2564,6 +2610,7 @@ define void @s_shuffle_v3p3_v2p3__u_0_0() { ; ; GFX90A-LABEL: s_shuffle_v3p3_v2p3__u_0_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -2577,6 +2624,7 @@ define void @s_shuffle_v3p3_v2p3__u_0_0() { ; ; GFX942-LABEL: s_shuffle_v3p3_v2p3__u_0_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -2663,6 +2711,7 @@ define void @s_shuffle_v3p3_v2p3__1_0_0() { define void @s_shuffle_v3p3_v2p3__2_0_0() { ; GFX900-LABEL: s_shuffle_v3p3_v2p3__2_0_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -2676,6 +2725,7 @@ define void @s_shuffle_v3p3_v2p3__2_0_0() { ; ; GFX90A-LABEL: s_shuffle_v3p3_v2p3__2_0_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -2689,6 +2739,7 @@ define void @s_shuffle_v3p3_v2p3__2_0_0() { ; ; GFX942-LABEL: s_shuffle_v3p3_v2p3__2_0_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -2766,6 +2817,7 @@ define void @s_shuffle_v3p3_v2p3__3_0_0() { define void @s_shuffle_v3p3_v2p3__3_u_0() { ; GFX900-LABEL: s_shuffle_v3p3_v2p3__3_u_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -2782,6 +2834,7 @@ define void @s_shuffle_v3p3_v2p3__3_u_0() { ; ; GFX90A-LABEL: s_shuffle_v3p3_v2p3__3_u_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -2798,6 +2851,7 @@ define void @s_shuffle_v3p3_v2p3__3_u_0() { ; ; GFX942-LABEL: s_shuffle_v3p3_v2p3__3_u_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -3065,6 +3119,7 @@ define void @s_shuffle_v3p3_v2p3__3_1_1() { define void @s_shuffle_v3p3_v2p3__3_u_1() { ; GFX900-LABEL: s_shuffle_v3p3_v2p3__3_u_1: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -3081,6 +3136,7 @@ define void @s_shuffle_v3p3_v2p3__3_u_1() { ; ; GFX90A-LABEL: s_shuffle_v3p3_v2p3__3_u_1: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -3097,6 +3153,7 @@ define void @s_shuffle_v3p3_v2p3__3_u_1() { ; ; GFX942-LABEL: s_shuffle_v3p3_v2p3__3_u_1: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -3290,6 +3347,7 @@ define void @s_shuffle_v3p3_v2p3__0_2_2() { define void @s_shuffle_v3p3_v2p3__1_2_2() { ; GFX900-LABEL: s_shuffle_v3p3_v2p3__1_2_2: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -3302,6 +3360,7 @@ define void @s_shuffle_v3p3_v2p3__1_2_2() { ; ; GFX90A-LABEL: s_shuffle_v3p3_v2p3__1_2_2: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -3314,6 +3373,7 @@ define void @s_shuffle_v3p3_v2p3__1_2_2() { ; ; GFX942-LABEL: s_shuffle_v3p3_v2p3__1_2_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -3395,6 +3455,7 @@ define void @s_shuffle_v3p3_v2p3__3_2_2() { define void @s_shuffle_v3p3_v2p3__3_u_2() { ; GFX900-LABEL: s_shuffle_v3p3_v2p3__3_u_2: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -3408,6 +3469,7 @@ define void @s_shuffle_v3p3_v2p3__3_u_2() { ; ; GFX90A-LABEL: s_shuffle_v3p3_v2p3__3_u_2: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -3421,6 +3483,7 @@ define void @s_shuffle_v3p3_v2p3__3_u_2() { ; ; GFX942-LABEL: s_shuffle_v3p3_v2p3__3_u_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -3702,6 +3765,7 @@ define void @s_shuffle_v3p3_v2p3__2_3_3() { define void @s_shuffle_v3p3_v2p3__3_u_3() { ; GFX900-LABEL: s_shuffle_v3p3_v2p3__3_u_3: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -3715,6 +3779,7 @@ define void @s_shuffle_v3p3_v2p3__3_u_3() { ; ; GFX90A-LABEL: s_shuffle_v3p3_v2p3__3_u_3: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -3728,6 +3793,7 @@ define void @s_shuffle_v3p3_v2p3__3_u_3() { ; ; GFX942-LABEL: s_shuffle_v3p3_v2p3__3_u_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v3p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v3p3.ll index cecd2a0e4b015..8687438adcef3 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v3p3.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v3p3.ll @@ -61,9 +61,10 @@ define void @v_shuffle_v3p3_v3p3__1_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -73,9 +74,10 @@ define void @v_shuffle_v3p3_v3p3__1_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -85,9 +87,10 @@ define void @v_shuffle_v3p3_v3p3__1_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -99,37 +102,41 @@ define void @v_shuffle_v3p3_v3p3__1_u_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v3p3_v3p3__2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__2_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p3_v3p3__2_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v3p3__2_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -156,9 +163,10 @@ define void @v_shuffle_v3p3_v3p3__4_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -168,9 +176,10 @@ define void @v_shuffle_v3p3_v3p3__4_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -180,9 +189,10 @@ define void @v_shuffle_v3p3_v3p3__4_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -195,37 +205,41 @@ define void @v_shuffle_v3p3_v3p3__4_u_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v3p3_v3p3__5_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v3p3__5_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -242,13 +256,14 @@ define void @v_shuffle_v3p3_v3p3__5_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -258,13 +273,14 @@ define void @v_shuffle_v3p3_v3p3__5_0_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -274,14 +290,14 @@ define void @v_shuffle_v3p3_v3p3__5_0_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8 ; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -347,49 +363,53 @@ define void @v_shuffle_v3p3_v3p3__5_1_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v3p3_v3p3__5_2_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_2_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: v_mov_b32_e32 v7, v2 +; GFX900-NEXT: global_store_dwordx3 v9, v[6:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_2_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr8_vgpr9_vgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[8:10], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v3p3__5_2_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr8_vgpr9_vgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v6 +; GFX942-NEXT: v_mov_b32_e32 v9, v2 +; GFX942-NEXT: global_store_dwordx3 v3, v[8:10], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -402,39 +422,44 @@ define void @v_shuffle_v3p3_v3p3__5_2_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v3p3_v3p3__5_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_3_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_3_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v3p3__5_3_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -490,40 +515,44 @@ define void @v_shuffle_v3p3_v3p3__5_4_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v3p3_v3p3__5_5_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_5_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_5_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v3p3__5_5_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -859,10 +888,11 @@ define void @v_shuffle_v3p3_v3p3__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -872,10 +902,11 @@ define void @v_shuffle_v3p3_v3p3__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -885,10 +916,11 @@ define void @v_shuffle_v3p3_v3p3__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1043,10 +1075,11 @@ define void @v_shuffle_v3p3_v3p3__3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1056,10 +1089,11 @@ define void @v_shuffle_v3p3_v3p3__3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1069,10 +1103,11 @@ define void @v_shuffle_v3p3_v3p3__3_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1206,13 +1241,14 @@ define void @v_shuffle_v3p3_v3p3__5_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1222,13 +1258,14 @@ define void @v_shuffle_v3p3_v3p3__5_u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1238,14 +1275,14 @@ define void @v_shuffle_v3p3_v3p3__5_u_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8 ; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1818,13 +1855,14 @@ define void @v_shuffle_v3p3_v3p3__5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr5_vgpr6_vgpr7 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx3 v8, v[5:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1834,13 +1872,14 @@ define void @v_shuffle_v3p3_v3p3__5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[6:8], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1850,14 +1889,14 @@ define void @v_shuffle_v3p3_v3p3__5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8 ; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: global_store_dwordx3 v5, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: global_store_dwordx3 v5, v[6:8], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2754,9 +2793,10 @@ define void @v_shuffle_v3p3_v3p3__1_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2766,9 +2806,10 @@ define void @v_shuffle_v3p3_v3p3__1_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2778,9 +2819,10 @@ define void @v_shuffle_v3p3_v3p3__1_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2792,37 +2834,41 @@ define void @v_shuffle_v3p3_v3p3__1_3_3(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v3p3_v3p3__2_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__2_3_3: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p3_v3p3__2_3_3: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v3p3__2_3_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2941,39 +2987,44 @@ define void @v_shuffle_v3p3_v3p3__5_3_3(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v3p3_v3p3__5_u_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_u_3: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_u_3: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v3, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v3p3__5_u_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v3, v[2:4], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3544,40 +3595,44 @@ define void @v_shuffle_v3p3_v3p3__5_4_4(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v3p3_v3p3__5_u_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_u_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:2] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_u_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v3p3__5_u_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4462,6 +4517,7 @@ define void @s_shuffle_v3p3_v3p3__0_u_u() { define void @s_shuffle_v3p3_v3p3__1_u_u() { ; GFX900-LABEL: s_shuffle_v3p3_v3p3__1_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -4474,6 +4530,7 @@ define void @s_shuffle_v3p3_v3p3__1_u_u() { ; ; GFX90A-LABEL: s_shuffle_v3p3_v3p3__1_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -4486,6 +4543,7 @@ define void @s_shuffle_v3p3_v3p3__1_u_u() { ; ; GFX942-LABEL: s_shuffle_v3p3_v3p3__1_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -4504,6 +4562,7 @@ define void @s_shuffle_v3p3_v3p3__1_u_u() { define void @s_shuffle_v3p3_v3p3__2_u_u() { ; GFX900-LABEL: s_shuffle_v3p3_v3p3__2_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -4516,6 +4575,7 @@ define void @s_shuffle_v3p3_v3p3__2_u_u() { ; ; GFX90A-LABEL: s_shuffle_v3p3_v3p3__2_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -4528,6 +4588,7 @@ define void @s_shuffle_v3p3_v3p3__2_u_u() { ; ; GFX942-LABEL: s_shuffle_v3p3_v3p3__2_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -4560,6 +4621,7 @@ define void @s_shuffle_v3p3_v3p3__3_u_u() { define void @s_shuffle_v3p3_v3p3__4_u_u() { ; GFX900-LABEL: s_shuffle_v3p3_v3p3__4_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -4572,6 +4634,7 @@ define void @s_shuffle_v3p3_v3p3__4_u_u() { ; ; GFX90A-LABEL: s_shuffle_v3p3_v3p3__4_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -4584,6 +4647,7 @@ define void @s_shuffle_v3p3_v3p3__4_u_u() { ; ; GFX942-LABEL: s_shuffle_v3p3_v3p3__4_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -4603,6 +4667,7 @@ define void @s_shuffle_v3p3_v3p3__4_u_u() { define void @s_shuffle_v3p3_v3p3__5_u_u() { ; GFX900-LABEL: s_shuffle_v3p3_v3p3__5_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -4615,6 +4680,7 @@ define void @s_shuffle_v3p3_v3p3__5_u_u() { ; ; GFX90A-LABEL: s_shuffle_v3p3_v3p3__5_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -4627,6 +4693,7 @@ define void @s_shuffle_v3p3_v3p3__5_u_u() { ; ; GFX942-LABEL: s_shuffle_v3p3_v3p3__5_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -4646,14 +4713,15 @@ define void @s_shuffle_v3p3_v3p3__5_u_u() { define void @s_shuffle_v3p3_v3p3__5_0_u() { ; GFX900-LABEL: s_shuffle_v3p3_v3p3__5_0_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ; def s[12:14] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s8, s14 ; GFX900-NEXT: s_mov_b32 s9, s4 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:10] @@ -4662,14 +4730,15 @@ define void @s_shuffle_v3p3_v3p3__5_0_u() { ; ; GFX90A-LABEL: s_shuffle_v3p3_v3p3__5_0_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ; def s[12:14] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s8, s14 ; GFX90A-NEXT: s_mov_b32 s9, s4 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:10] @@ -4678,6 +4747,7 @@ define void @s_shuffle_v3p3_v3p3__5_0_u() { ; ; GFX942-LABEL: s_shuffle_v3p3_v3p3__5_0_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -4753,14 +4823,15 @@ define void @s_shuffle_v3p3_v3p3__5_1_u() { define void @s_shuffle_v3p3_v3p3__5_2_u() { ; GFX900-LABEL: s_shuffle_v3p3_v3p3__5_2_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ; def s[12:14] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s8, s14 ; GFX900-NEXT: s_mov_b32 s9, s6 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:10] @@ -4769,14 +4840,15 @@ define void @s_shuffle_v3p3_v3p3__5_2_u() { ; ; GFX90A-LABEL: s_shuffle_v3p3_v3p3__5_2_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ; def s[12:14] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s8, s14 ; GFX90A-NEXT: s_mov_b32 s9, s6 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:10] @@ -4785,6 +4857,7 @@ define void @s_shuffle_v3p3_v3p3__5_2_u() { ; ; GFX942-LABEL: s_shuffle_v3p3_v3p3__5_2_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -4808,6 +4881,7 @@ define void @s_shuffle_v3p3_v3p3__5_2_u() { define void @s_shuffle_v3p3_v3p3__5_3_u() { ; GFX900-LABEL: s_shuffle_v3p3_v3p3__5_3_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -4821,6 +4895,7 @@ define void @s_shuffle_v3p3_v3p3__5_3_u() { ; ; GFX90A-LABEL: s_shuffle_v3p3_v3p3__5_3_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -4834,6 +4909,7 @@ define void @s_shuffle_v3p3_v3p3__5_3_u() { ; ; GFX942-LABEL: s_shuffle_v3p3_v3p3__5_3_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -4873,6 +4949,7 @@ define void @s_shuffle_v3p3_v3p3__5_4_u() { define void @s_shuffle_v3p3_v3p3__5_5_u() { ; GFX900-LABEL: s_shuffle_v3p3_v3p3__5_5_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -4886,6 +4963,7 @@ define void @s_shuffle_v3p3_v3p3__5_5_u() { ; ; GFX90A-LABEL: s_shuffle_v3p3_v3p3__5_5_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -4899,6 +4977,7 @@ define void @s_shuffle_v3p3_v3p3__5_5_u() { ; ; GFX942-LABEL: s_shuffle_v3p3_v3p3__5_5_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -5208,6 +5287,7 @@ define void @s_shuffle_v3p3_v3p3__5_5_5() { define void @s_shuffle_v3p3_v3p3__u_0_0() { ; GFX900-LABEL: s_shuffle_v3p3_v3p3__u_0_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -5221,6 +5301,7 @@ define void @s_shuffle_v3p3_v3p3__u_0_0() { ; ; GFX90A-LABEL: s_shuffle_v3p3_v3p3__u_0_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -5234,6 +5315,7 @@ define void @s_shuffle_v3p3_v3p3__u_0_0() { ; ; GFX942-LABEL: s_shuffle_v3p3_v3p3__u_0_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -5368,6 +5450,7 @@ define void @s_shuffle_v3p3_v3p3__2_0_0() { define void @s_shuffle_v3p3_v3p3__3_0_0() { ; GFX900-LABEL: s_shuffle_v3p3_v3p3__3_0_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -5381,6 +5464,7 @@ define void @s_shuffle_v3p3_v3p3__3_0_0() { ; ; GFX90A-LABEL: s_shuffle_v3p3_v3p3__3_0_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -5394,6 +5478,7 @@ define void @s_shuffle_v3p3_v3p3__3_0_0() { ; ; GFX942-LABEL: s_shuffle_v3p3_v3p3__3_0_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -5529,14 +5614,15 @@ define void @s_shuffle_v3p3_v3p3__5_0_0() { define void @s_shuffle_v3p3_v3p3__5_u_0() { ; GFX900-LABEL: s_shuffle_v3p3_v3p3__5_u_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ; def s[12:14] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s8, s14 ; GFX900-NEXT: s_mov_b32 s10, s4 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:10] @@ -5545,14 +5631,15 @@ define void @s_shuffle_v3p3_v3p3__5_u_0() { ; ; GFX90A-LABEL: s_shuffle_v3p3_v3p3__5_u_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ; def s[12:14] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s8, s14 ; GFX90A-NEXT: s_mov_b32 s10, s4 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:10] @@ -5561,6 +5648,7 @@ define void @s_shuffle_v3p3_v3p3__5_u_0() { ; ; GFX942-LABEL: s_shuffle_v3p3_v3p3__5_u_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -6015,14 +6103,15 @@ define void @s_shuffle_v3p3_v3p3__5_1_1() { define void @s_shuffle_v3p3_v3p3__5_u_1() { ; GFX900-LABEL: s_shuffle_v3p3_v3p3__5_u_1: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ; def s[12:14] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s8, s14 ; GFX900-NEXT: s_mov_b32 s10, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:10] @@ -6031,14 +6120,15 @@ define void @s_shuffle_v3p3_v3p3__5_u_1() { ; ; GFX90A-LABEL: s_shuffle_v3p3_v3p3__5_u_1: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ; def s[12:14] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s8, s14 ; GFX90A-NEXT: s_mov_b32 s10, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:10] @@ -6047,6 +6137,7 @@ define void @s_shuffle_v3p3_v3p3__5_u_1() { ; ; GFX942-LABEL: s_shuffle_v3p3_v3p3__5_u_1: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -6827,6 +6918,7 @@ define void @s_shuffle_v3p3_v3p3__0_3_3() { define void @s_shuffle_v3p3_v3p3__1_3_3() { ; GFX900-LABEL: s_shuffle_v3p3_v3p3__1_3_3: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -6839,6 +6931,7 @@ define void @s_shuffle_v3p3_v3p3__1_3_3() { ; ; GFX90A-LABEL: s_shuffle_v3p3_v3p3__1_3_3: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -6851,6 +6944,7 @@ define void @s_shuffle_v3p3_v3p3__1_3_3() { ; ; GFX942-LABEL: s_shuffle_v3p3_v3p3__1_3_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -6869,6 +6963,7 @@ define void @s_shuffle_v3p3_v3p3__1_3_3() { define void @s_shuffle_v3p3_v3p3__2_3_3() { ; GFX900-LABEL: s_shuffle_v3p3_v3p3__2_3_3: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -6881,6 +6976,7 @@ define void @s_shuffle_v3p3_v3p3__2_3_3() { ; ; GFX90A-LABEL: s_shuffle_v3p3_v3p3__2_3_3: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -6893,6 +6989,7 @@ define void @s_shuffle_v3p3_v3p3__2_3_3() { ; ; GFX942-LABEL: s_shuffle_v3p3_v3p3__2_3_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -7023,6 +7120,7 @@ define void @s_shuffle_v3p3_v3p3__5_3_3() { define void @s_shuffle_v3p3_v3p3__5_u_3() { ; GFX900-LABEL: s_shuffle_v3p3_v3p3__5_u_3: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -7036,6 +7134,7 @@ define void @s_shuffle_v3p3_v3p3__5_u_3() { ; ; GFX90A-LABEL: s_shuffle_v3p3_v3p3__5_u_3: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -7049,6 +7148,7 @@ define void @s_shuffle_v3p3_v3p3__5_u_3() { ; ; GFX942-LABEL: s_shuffle_v3p3_v3p3__5_u_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -7532,6 +7632,7 @@ define void @s_shuffle_v3p3_v3p3__5_4_4() { define void @s_shuffle_v3p3_v3p3__5_u_4() { ; GFX900-LABEL: s_shuffle_v3p3_v3p3__5_u_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -7545,6 +7646,7 @@ define void @s_shuffle_v3p3_v3p3__5_u_4() { ; ; GFX90A-LABEL: s_shuffle_v3p3_v3p3__5_u_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -7558,6 +7660,7 @@ define void @s_shuffle_v3p3_v3p3__5_u_4() { ; ; GFX942-LABEL: s_shuffle_v3p3_v3p3__5_u_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v4p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v4p3.ll index 834f03f013ba1..734d7deca4f13 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v4p3.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v4p3.ll @@ -72,9 +72,10 @@ define void @v_shuffle_v3p3_v4p3__1_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -84,9 +85,10 @@ define void @v_shuffle_v3p3_v4p3__1_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -102,33 +104,37 @@ define void @v_shuffle_v3p3_v4p3__2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__2_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__2_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -144,6 +150,7 @@ define void @v_shuffle_v3p3_v4p3__3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v3 ; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] @@ -156,6 +163,7 @@ define void @v_shuffle_v3p3_v4p3__3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] @@ -168,6 +176,7 @@ define void @v_shuffle_v3p3_v4p3__3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] @@ -208,9 +217,10 @@ define void @v_shuffle_v3p3_v4p3__5_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -220,9 +230,10 @@ define void @v_shuffle_v3p3_v4p3__5_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -239,33 +250,37 @@ define void @v_shuffle_v3p3_v4p3__6_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__6_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__6_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -282,6 +297,7 @@ define void @v_shuffle_v3p3_v4p3__7_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v3 ; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] @@ -294,6 +310,7 @@ define void @v_shuffle_v3p3_v4p3__7_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] @@ -306,6 +323,7 @@ define void @v_shuffle_v3p3_v4p3__7_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] @@ -328,6 +346,7 @@ define void @v_shuffle_v3p3_v4p3__7_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: v_mov_b32_e32 v1, v4 ; GFX900-NEXT: v_mov_b32_e32 v2, v0 @@ -344,6 +363,7 @@ define void @v_shuffle_v3p3_v4p3__7_0_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v5 ; GFX90A-NEXT: v_mov_b32_e32 v3, v0 @@ -361,6 +381,7 @@ define void @v_shuffle_v3p3_v4p3__7_0_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mov_b32_e32 v2, v5 ; GFX942-NEXT: v_mov_b32_e32 v3, v0 @@ -447,15 +468,16 @@ define void @v_shuffle_v3p3_v4p3__7_2_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx3 v8, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -463,15 +485,16 @@ define void @v_shuffle_v3p3_v4p3__7_2_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v7 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -488,6 +511,7 @@ define void @v_shuffle_v3p3_v4p3__7_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:7] @@ -504,6 +528,7 @@ define void @v_shuffle_v3p3_v4p3__7_3_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] @@ -520,6 +545,7 @@ define void @v_shuffle_v3p3_v4p3__7_3_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] @@ -539,40 +565,44 @@ define void @v_shuffle_v3p3_v4p3__7_3_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v3p3_v4p3__7_4_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_4_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_4_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__7_4_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -640,27 +670,30 @@ define void @v_shuffle_v3p3_v4p3__7_6_u(ptr addrspace(1) inreg %ptr) { ; ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_6_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__7_6_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v2 +; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -677,6 +710,7 @@ define void @v_shuffle_v3p3_v4p3__7_7_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v3 ; GFX900-NEXT: v_mov_b32_e32 v1, v3 @@ -690,6 +724,7 @@ define void @v_shuffle_v3p3_v4p3__7_7_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3 @@ -703,6 +738,7 @@ define void @v_shuffle_v3p3_v4p3__7_7_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: v_mov_b32_e32 v1, v3 @@ -1141,10 +1177,11 @@ define void @v_shuffle_v3p3_v4p3__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 ; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1154,10 +1191,11 @@ define void @v_shuffle_v3p3_v4p3__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1167,10 +1205,11 @@ define void @v_shuffle_v3p3_v4p3__u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1372,10 +1411,11 @@ define void @v_shuffle_v3p3_v4p3__4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 ; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1385,10 +1425,11 @@ define void @v_shuffle_v3p3_v4p3__4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1398,10 +1439,11 @@ define void @v_shuffle_v3p3_v4p3__4_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1596,6 +1638,7 @@ define void @v_shuffle_v3p3_v4p3__7_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: v_mov_b32_e32 v1, v4 ; GFX900-NEXT: v_mov_b32_e32 v3, v0 @@ -1612,6 +1655,7 @@ define void @v_shuffle_v3p3_v4p3__7_u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v5 ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 @@ -1629,6 +1673,7 @@ define void @v_shuffle_v3p3_v4p3__7_u_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mov_b32_e32 v2, v5 ; GFX942-NEXT: v_mov_b32_e32 v4, v0 @@ -2425,10 +2470,11 @@ define void @v_shuffle_v3p3_v4p3__7_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 ; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2441,10 +2487,11 @@ define void @v_shuffle_v3p3_v4p3__7_u_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2458,10 +2505,11 @@ define void @v_shuffle_v3p3_v4p3__7_u_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v5 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v5 +; GFX942-NEXT: v_mov_b32_e32 v4, v1 +; GFX942-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3645,6 +3693,7 @@ define void @v_shuffle_v3p3_v4p3__u_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3 ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 @@ -3658,6 +3707,7 @@ define void @v_shuffle_v3p3_v4p3__u_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: v_mov_b32_e32 v1, v3 ; GFX942-NEXT: v_mov_b32_e32 v2, v3 @@ -3874,6 +3924,7 @@ define void @v_shuffle_v3p3_v4p3__4_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3 ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 @@ -3887,6 +3938,7 @@ define void @v_shuffle_v3p3_v4p3__4_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: v_mov_b32_e32 v1, v3 ; GFX942-NEXT: v_mov_b32_e32 v2, v3 @@ -4092,6 +4144,7 @@ define void @v_shuffle_v3p3_v4p3__7_u_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] @@ -4108,6 +4161,7 @@ define void @v_shuffle_v3p3_v4p3__7_u_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] @@ -4529,9 +4583,10 @@ define void @v_shuffle_v3p3_v4p3__1_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4541,9 +4596,10 @@ define void @v_shuffle_v3p3_v4p3__1_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4559,33 +4615,37 @@ define void @v_shuffle_v3p3_v4p3__2_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__2_4_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__2_4_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4601,6 +4661,7 @@ define void @v_shuffle_v3p3_v4p3__3_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v3 ; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] @@ -4613,6 +4674,7 @@ define void @v_shuffle_v3p3_v4p3__3_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] @@ -4625,6 +4687,7 @@ define void @v_shuffle_v3p3_v4p3__3_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] @@ -4794,40 +4857,44 @@ define void @v_shuffle_v3p3_v4p3__7_4_4(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v3p3_v4p3__7_u_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_u_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_u_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__7_u_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5608,40 +5675,44 @@ define void @v_shuffle_v3p3_v4p3__7_5_5(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v3p3_v4p3__7_u_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_u_5: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_u_5: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v3p3_v4p3__7_u_5: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -6800,6 +6871,7 @@ define void @v_shuffle_v3p3_v4p3__u_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3 ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 @@ -6813,6 +6885,7 @@ define void @v_shuffle_v3p3_v4p3__u_7_7(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: v_mov_b32_e32 v1, v3 ; GFX942-NEXT: v_mov_b32_e32 v2, v3 @@ -7213,6 +7286,7 @@ define void @v_shuffle_v3p3_v4p3__7_u_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: v_mov_b32_e32 v2, v3 @@ -7226,6 +7300,7 @@ define void @v_shuffle_v3p3_v4p3__7_u_7(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v3 ; GFX942-NEXT: v_mov_b32_e32 v2, v3 @@ -7666,6 +7741,7 @@ define void @s_shuffle_v3p3_v4p3__0_u_u() { define void @s_shuffle_v3p3_v4p3__1_u_u() { ; GFX900-LABEL: s_shuffle_v3p3_v4p3__1_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -7678,6 +7754,7 @@ define void @s_shuffle_v3p3_v4p3__1_u_u() { ; ; GFX90A-LABEL: s_shuffle_v3p3_v4p3__1_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -7690,6 +7767,7 @@ define void @s_shuffle_v3p3_v4p3__1_u_u() { ; ; GFX942-LABEL: s_shuffle_v3p3_v4p3__1_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -7708,6 +7786,7 @@ define void @s_shuffle_v3p3_v4p3__1_u_u() { define void @s_shuffle_v3p3_v4p3__2_u_u() { ; GFX900-LABEL: s_shuffle_v3p3_v4p3__2_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -7720,6 +7799,7 @@ define void @s_shuffle_v3p3_v4p3__2_u_u() { ; ; GFX90A-LABEL: s_shuffle_v3p3_v4p3__2_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -7732,6 +7812,7 @@ define void @s_shuffle_v3p3_v4p3__2_u_u() { ; ; GFX942-LABEL: s_shuffle_v3p3_v4p3__2_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -7750,6 +7831,7 @@ define void @s_shuffle_v3p3_v4p3__2_u_u() { define void @s_shuffle_v3p3_v4p3__3_u_u() { ; GFX900-LABEL: s_shuffle_v3p3_v4p3__3_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -7762,6 +7844,7 @@ define void @s_shuffle_v3p3_v4p3__3_u_u() { ; ; GFX90A-LABEL: s_shuffle_v3p3_v4p3__3_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -7774,6 +7857,7 @@ define void @s_shuffle_v3p3_v4p3__3_u_u() { ; ; GFX942-LABEL: s_shuffle_v3p3_v4p3__3_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -7806,6 +7890,7 @@ define void @s_shuffle_v3p3_v4p3__4_u_u() { define void @s_shuffle_v3p3_v4p3__5_u_u() { ; GFX900-LABEL: s_shuffle_v3p3_v4p3__5_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -7818,6 +7903,7 @@ define void @s_shuffle_v3p3_v4p3__5_u_u() { ; ; GFX90A-LABEL: s_shuffle_v3p3_v4p3__5_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -7830,6 +7916,7 @@ define void @s_shuffle_v3p3_v4p3__5_u_u() { ; ; GFX942-LABEL: s_shuffle_v3p3_v4p3__5_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -7849,6 +7936,7 @@ define void @s_shuffle_v3p3_v4p3__5_u_u() { define void @s_shuffle_v3p3_v4p3__6_u_u() { ; GFX900-LABEL: s_shuffle_v3p3_v4p3__6_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -7861,6 +7949,7 @@ define void @s_shuffle_v3p3_v4p3__6_u_u() { ; ; GFX90A-LABEL: s_shuffle_v3p3_v4p3__6_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -7873,6 +7962,7 @@ define void @s_shuffle_v3p3_v4p3__6_u_u() { ; ; GFX942-LABEL: s_shuffle_v3p3_v4p3__6_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -7892,6 +7982,7 @@ define void @s_shuffle_v3p3_v4p3__6_u_u() { define void @s_shuffle_v3p3_v4p3__7_u_u() { ; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -7904,6 +7995,7 @@ define void @s_shuffle_v3p3_v4p3__7_u_u() { ; ; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -7916,6 +8008,7 @@ define void @s_shuffle_v3p3_v4p3__7_u_u() { ; ; GFX942-LABEL: s_shuffle_v3p3_v4p3__7_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -7939,6 +8032,7 @@ define void @s_shuffle_v3p3_v4p3__7_0_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] ; GFX900-NEXT: ;;#ASMEND @@ -7955,6 +8049,7 @@ define void @s_shuffle_v3p3_v4p3__7_0_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] ; GFX90A-NEXT: ;;#ASMEND @@ -7967,6 +8062,7 @@ define void @s_shuffle_v3p3_v4p3__7_0_u() { ; ; GFX942-LABEL: s_shuffle_v3p3_v4p3__7_0_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -8046,6 +8142,7 @@ define void @s_shuffle_v3p3_v4p3__7_2_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] ; GFX900-NEXT: ;;#ASMEND @@ -8062,6 +8159,7 @@ define void @s_shuffle_v3p3_v4p3__7_2_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] ; GFX90A-NEXT: ;;#ASMEND @@ -8074,6 +8172,7 @@ define void @s_shuffle_v3p3_v4p3__7_2_u() { ; ; GFX942-LABEL: s_shuffle_v3p3_v4p3__7_2_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -8101,6 +8200,7 @@ define void @s_shuffle_v3p3_v4p3__7_3_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] ; GFX900-NEXT: ;;#ASMEND @@ -8117,6 +8217,7 @@ define void @s_shuffle_v3p3_v4p3__7_3_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] ; GFX90A-NEXT: ;;#ASMEND @@ -8129,6 +8230,7 @@ define void @s_shuffle_v3p3_v4p3__7_3_u() { ; ; GFX942-LABEL: s_shuffle_v3p3_v4p3__7_3_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -8152,6 +8254,7 @@ define void @s_shuffle_v3p3_v4p3__7_3_u() { define void @s_shuffle_v3p3_v4p3__7_4_u() { ; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_4_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -8165,6 +8268,7 @@ define void @s_shuffle_v3p3_v4p3__7_4_u() { ; ; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_4_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -8178,6 +8282,7 @@ define void @s_shuffle_v3p3_v4p3__7_4_u() { ; ; GFX942-LABEL: s_shuffle_v3p3_v4p3__7_4_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -8217,6 +8322,7 @@ define void @s_shuffle_v3p3_v4p3__7_5_u() { define void @s_shuffle_v3p3_v4p3__7_6_u() { ; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_6_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -8230,6 +8336,7 @@ define void @s_shuffle_v3p3_v4p3__7_6_u() { ; ; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_6_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -8243,6 +8350,7 @@ define void @s_shuffle_v3p3_v4p3__7_6_u() { ; ; GFX942-LABEL: s_shuffle_v3p3_v4p3__7_6_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -8263,6 +8371,7 @@ define void @s_shuffle_v3p3_v4p3__7_6_u() { define void @s_shuffle_v3p3_v4p3__7_7_u() { ; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_7_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -8276,6 +8385,7 @@ define void @s_shuffle_v3p3_v4p3__7_7_u() { ; ; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_7_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -8289,6 +8399,7 @@ define void @s_shuffle_v3p3_v4p3__7_7_u() { ; ; GFX942-LABEL: s_shuffle_v3p3_v4p3__7_7_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -8705,6 +8816,7 @@ define void @s_shuffle_v3p3_v4p3__7_7_7() { define void @s_shuffle_v3p3_v4p3__u_0_0() { ; GFX900-LABEL: s_shuffle_v3p3_v4p3__u_0_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -8718,6 +8830,7 @@ define void @s_shuffle_v3p3_v4p3__u_0_0() { ; ; GFX90A-LABEL: s_shuffle_v3p3_v4p3__u_0_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -8731,6 +8844,7 @@ define void @s_shuffle_v3p3_v4p3__u_0_0() { ; ; GFX942-LABEL: s_shuffle_v3p3_v4p3__u_0_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -8913,6 +9027,7 @@ define void @s_shuffle_v3p3_v4p3__3_0_0() { define void @s_shuffle_v3p3_v4p3__4_0_0() { ; GFX900-LABEL: s_shuffle_v3p3_v4p3__4_0_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -8926,6 +9041,7 @@ define void @s_shuffle_v3p3_v4p3__4_0_0() { ; ; GFX90A-LABEL: s_shuffle_v3p3_v4p3__4_0_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -8939,6 +9055,7 @@ define void @s_shuffle_v3p3_v4p3__4_0_0() { ; ; GFX942-LABEL: s_shuffle_v3p3_v4p3__4_0_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -9136,6 +9253,7 @@ define void @s_shuffle_v3p3_v4p3__7_u_0() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] ; GFX900-NEXT: ;;#ASMEND @@ -9152,6 +9270,7 @@ define void @s_shuffle_v3p3_v4p3__7_u_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] ; GFX90A-NEXT: ;;#ASMEND @@ -9164,6 +9283,7 @@ define void @s_shuffle_v3p3_v4p3__7_u_0() { ; ; GFX942-LABEL: s_shuffle_v3p3_v4p3__7_u_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -9812,6 +9932,7 @@ define void @s_shuffle_v3p3_v4p3__7_u_1() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] ; GFX900-NEXT: ;;#ASMEND @@ -9828,6 +9949,7 @@ define void @s_shuffle_v3p3_v4p3__7_u_1() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] ; GFX90A-NEXT: ;;#ASMEND @@ -9840,6 +9962,7 @@ define void @s_shuffle_v3p3_v4p3__7_u_1() { ; ; GFX942-LABEL: s_shuffle_v3p3_v4p3__7_u_1: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -10866,6 +10989,7 @@ define void @s_shuffle_v3p3_v4p3__7_6_2() { define void @s_shuffle_v3p3_v4p3__u_3_3() { ; GFX900-LABEL: s_shuffle_v3p3_v4p3__u_3_3: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -10879,6 +11003,7 @@ define void @s_shuffle_v3p3_v4p3__u_3_3() { ; ; GFX90A-LABEL: s_shuffle_v3p3_v4p3__u_3_3: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -10892,6 +11017,7 @@ define void @s_shuffle_v3p3_v4p3__u_3_3() { ; ; GFX942-LABEL: s_shuffle_v3p3_v4p3__u_3_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -11074,6 +11200,7 @@ define void @s_shuffle_v3p3_v4p3__3_3_3() { define void @s_shuffle_v3p3_v4p3__4_3_3() { ; GFX900-LABEL: s_shuffle_v3p3_v4p3__4_3_3: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -11087,6 +11214,7 @@ define void @s_shuffle_v3p3_v4p3__4_3_3() { ; ; GFX90A-LABEL: s_shuffle_v3p3_v4p3__4_3_3: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -11100,6 +11228,7 @@ define void @s_shuffle_v3p3_v4p3__4_3_3() { ; ; GFX942-LABEL: s_shuffle_v3p3_v4p3__4_3_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -11297,6 +11426,7 @@ define void @s_shuffle_v3p3_v4p3__7_u_3() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] ; GFX900-NEXT: ;;#ASMEND @@ -11313,6 +11443,7 @@ define void @s_shuffle_v3p3_v4p3__7_u_3() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] ; GFX90A-NEXT: ;;#ASMEND @@ -11325,6 +11456,7 @@ define void @s_shuffle_v3p3_v4p3__7_u_3() { ; ; GFX942-LABEL: s_shuffle_v3p3_v4p3__7_u_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -11744,6 +11876,7 @@ define void @s_shuffle_v3p3_v4p3__0_4_4() { define void @s_shuffle_v3p3_v4p3__1_4_4() { ; GFX900-LABEL: s_shuffle_v3p3_v4p3__1_4_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -11756,6 +11889,7 @@ define void @s_shuffle_v3p3_v4p3__1_4_4() { ; ; GFX90A-LABEL: s_shuffle_v3p3_v4p3__1_4_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -11768,6 +11902,7 @@ define void @s_shuffle_v3p3_v4p3__1_4_4() { ; ; GFX942-LABEL: s_shuffle_v3p3_v4p3__1_4_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -11786,6 +11921,7 @@ define void @s_shuffle_v3p3_v4p3__1_4_4() { define void @s_shuffle_v3p3_v4p3__2_4_4() { ; GFX900-LABEL: s_shuffle_v3p3_v4p3__2_4_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -11798,6 +11934,7 @@ define void @s_shuffle_v3p3_v4p3__2_4_4() { ; ; GFX90A-LABEL: s_shuffle_v3p3_v4p3__2_4_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -11810,6 +11947,7 @@ define void @s_shuffle_v3p3_v4p3__2_4_4() { ; ; GFX942-LABEL: s_shuffle_v3p3_v4p3__2_4_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -11828,6 +11966,7 @@ define void @s_shuffle_v3p3_v4p3__2_4_4() { define void @s_shuffle_v3p3_v4p3__3_4_4() { ; GFX900-LABEL: s_shuffle_v3p3_v4p3__3_4_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -11840,6 +11979,7 @@ define void @s_shuffle_v3p3_v4p3__3_4_4() { ; ; GFX90A-LABEL: s_shuffle_v3p3_v4p3__3_4_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -11852,6 +11992,7 @@ define void @s_shuffle_v3p3_v4p3__3_4_4() { ; ; GFX942-LABEL: s_shuffle_v3p3_v4p3__3_4_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -12031,6 +12172,7 @@ define void @s_shuffle_v3p3_v4p3__7_4_4() { define void @s_shuffle_v3p3_v4p3__7_u_4() { ; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_u_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -12044,6 +12186,7 @@ define void @s_shuffle_v3p3_v4p3__7_u_4() { ; ; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_u_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -12057,6 +12200,7 @@ define void @s_shuffle_v3p3_v4p3__7_u_4() { ; ; GFX942-LABEL: s_shuffle_v3p3_v4p3__7_u_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -12722,6 +12866,7 @@ define void @s_shuffle_v3p3_v4p3__7_5_5() { define void @s_shuffle_v3p3_v4p3__7_u_5() { ; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_u_5: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -12735,6 +12880,7 @@ define void @s_shuffle_v3p3_v4p3__7_u_5() { ; ; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_u_5: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -12748,6 +12894,7 @@ define void @s_shuffle_v3p3_v4p3__7_u_5() { ; ; GFX942-LABEL: s_shuffle_v3p3_v4p3__7_u_5: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -13720,6 +13867,7 @@ define void @s_shuffle_v3p3_v4p3__7_5_6() { define void @s_shuffle_v3p3_v4p3__u_7_7() { ; GFX900-LABEL: s_shuffle_v3p3_v4p3__u_7_7: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -13733,6 +13881,7 @@ define void @s_shuffle_v3p3_v4p3__u_7_7() { ; ; GFX90A-LABEL: s_shuffle_v3p3_v4p3__u_7_7: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -13746,6 +13895,7 @@ define void @s_shuffle_v3p3_v4p3__u_7_7() { ; ; GFX942-LABEL: s_shuffle_v3p3_v4p3__u_7_7: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -14113,6 +14263,7 @@ define void @s_shuffle_v3p3_v4p3__6_7_7() { define void @s_shuffle_v3p3_v4p3__7_u_7() { ; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_u_7: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -14126,6 +14277,7 @@ define void @s_shuffle_v3p3_v4p3__7_u_7() { ; ; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_u_7: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -14139,6 +14291,7 @@ define void @s_shuffle_v3p3_v4p3__7_u_7() { ; ; GFX942-LABEL: s_shuffle_v3p3_v4p3__7_u_7: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4bf16.v3bf16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4bf16.v3bf16.ll index fa422e48bbce0..830d1a1c7fef8 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4bf16.v3bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4bf16.v3bf16.ll @@ -102,37 +102,41 @@ define void @v_shuffle_v4bf16_v3bf16__1_u_u_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4bf16_v3bf16__2_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__2_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__2_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4bf16_v3bf16__2_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x bfloat> asm "; def $0", "=v"() @@ -202,37 +206,41 @@ define void @v_shuffle_v4bf16_v3bf16__4_u_u_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4bf16_v3bf16__5_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4bf16_v3bf16__5_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x bfloat> asm "; def $0", "=v"() @@ -4458,37 +4466,41 @@ define void @v_shuffle_v4bf16_v3bf16__1_3_3_3(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4bf16_v3bf16__2_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__2_3_3_3: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__2_3_3_3: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4bf16_v3bf16__2_3_3_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x bfloat> asm "; def $0", "=v"() @@ -7390,6 +7402,7 @@ define void @s_shuffle_v4bf16_v3bf16__1_u_u_u() { define void @s_shuffle_v4bf16_v3bf16__2_u_u_u() { ; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__2_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -7402,6 +7415,7 @@ define void @s_shuffle_v4bf16_v3bf16__2_u_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__2_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -7414,6 +7428,7 @@ define void @s_shuffle_v4bf16_v3bf16__2_u_u_u() { ; ; GFX942-LABEL: s_shuffle_v4bf16_v3bf16__2_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -7493,6 +7508,7 @@ define void @s_shuffle_v4bf16_v3bf16__4_u_u_u() { define void @s_shuffle_v4bf16_v3bf16__5_u_u_u() { ; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -7505,6 +7521,7 @@ define void @s_shuffle_v4bf16_v3bf16__5_u_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -7517,6 +7534,7 @@ define void @s_shuffle_v4bf16_v3bf16__5_u_u_u() { ; ; GFX942-LABEL: s_shuffle_v4bf16_v3bf16__5_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -11543,6 +11561,7 @@ define void @s_shuffle_v4bf16_v3bf16__1_3_3_3() { define void @s_shuffle_v4bf16_v3bf16__2_3_3_3() { ; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__2_3_3_3: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -11555,6 +11574,7 @@ define void @s_shuffle_v4bf16_v3bf16__2_3_3_3() { ; ; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__2_3_3_3: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -11567,6 +11587,7 @@ define void @s_shuffle_v4bf16_v3bf16__2_3_3_3() { ; ; GFX942-LABEL: s_shuffle_v4bf16_v3bf16__2_3_3_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4bf16.v4bf16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4bf16.v4bf16.ll index ab297c02fe3b5..a9427d66595e0 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4bf16.v4bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4bf16.v4bf16.ll @@ -99,37 +99,41 @@ define void @v_shuffle_v4bf16_v4bf16__1_u_u_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4bf16_v4bf16__2_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__2_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__2_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4bf16_v4bf16__2_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x bfloat> asm "; def $0", "=v"() @@ -237,37 +241,41 @@ define void @v_shuffle_v4bf16_v4bf16__5_u_u_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4bf16_v4bf16__6_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__6_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__6_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4bf16_v4bf16__6_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x bfloat> asm "; def $0", "=v"() @@ -7087,37 +7095,41 @@ define void @v_shuffle_v4bf16_v4bf16__1_4_4_4(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4bf16_v4bf16__2_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__2_4_4_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__2_4_4_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4bf16_v4bf16__2_4_4_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x bfloat> asm "; def $0", "=v"() @@ -12198,6 +12210,7 @@ define void @s_shuffle_v4bf16_v4bf16__1_u_u_u() { define void @s_shuffle_v4bf16_v4bf16__2_u_u_u() { ; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__2_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -12210,6 +12223,7 @@ define void @s_shuffle_v4bf16_v4bf16__2_u_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__2_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -12222,6 +12236,7 @@ define void @s_shuffle_v4bf16_v4bf16__2_u_u_u() { ; ; GFX942-LABEL: s_shuffle_v4bf16_v4bf16__2_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -12339,6 +12354,7 @@ define void @s_shuffle_v4bf16_v4bf16__5_u_u_u() { define void @s_shuffle_v4bf16_v4bf16__6_u_u_u() { ; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__6_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -12351,6 +12367,7 @@ define void @s_shuffle_v4bf16_v4bf16__6_u_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__6_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -12363,6 +12380,7 @@ define void @s_shuffle_v4bf16_v4bf16__6_u_u_u() { ; ; GFX942-LABEL: s_shuffle_v4bf16_v4bf16__6_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -19217,6 +19235,7 @@ define void @s_shuffle_v4bf16_v4bf16__1_4_4_4() { define void @s_shuffle_v4bf16_v4bf16__2_4_4_4() { ; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__2_4_4_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -19229,6 +19248,7 @@ define void @s_shuffle_v4bf16_v4bf16__2_4_4_4() { ; ; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__2_4_4_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -19241,6 +19261,7 @@ define void @s_shuffle_v4bf16_v4bf16__2_4_4_4() { ; ; GFX942-LABEL: s_shuffle_v4bf16_v4bf16__2_4_4_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f16.v3f16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f16.v3f16.ll index e91433ac4c1f7..826ae71ee386a 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f16.v3f16.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f16.v3f16.ll @@ -102,37 +102,41 @@ define void @v_shuffle_v4f16_v3f16__1_u_u_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4f16_v3f16__2_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f16_v3f16__2_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f16_v3f16__2_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f16_v3f16__2_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x half> asm "; def $0", "=v"() @@ -202,37 +206,41 @@ define void @v_shuffle_v4f16_v3f16__4_u_u_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4f16_v3f16__5_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f16_v3f16__5_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x half> asm "; def $0", "=v"() @@ -4458,37 +4466,41 @@ define void @v_shuffle_v4f16_v3f16__1_3_3_3(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4f16_v3f16__2_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f16_v3f16__2_3_3_3: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f16_v3f16__2_3_3_3: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f16_v3f16__2_3_3_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x half> asm "; def $0", "=v"() @@ -7390,6 +7402,7 @@ define void @s_shuffle_v4f16_v3f16__1_u_u_u() { define void @s_shuffle_v4f16_v3f16__2_u_u_u() { ; GFX900-LABEL: s_shuffle_v4f16_v3f16__2_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -7402,6 +7415,7 @@ define void @s_shuffle_v4f16_v3f16__2_u_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4f16_v3f16__2_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -7414,6 +7428,7 @@ define void @s_shuffle_v4f16_v3f16__2_u_u_u() { ; ; GFX942-LABEL: s_shuffle_v4f16_v3f16__2_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -7493,6 +7508,7 @@ define void @s_shuffle_v4f16_v3f16__4_u_u_u() { define void @s_shuffle_v4f16_v3f16__5_u_u_u() { ; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -7505,6 +7521,7 @@ define void @s_shuffle_v4f16_v3f16__5_u_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -7517,6 +7534,7 @@ define void @s_shuffle_v4f16_v3f16__5_u_u_u() { ; ; GFX942-LABEL: s_shuffle_v4f16_v3f16__5_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -11543,6 +11561,7 @@ define void @s_shuffle_v4f16_v3f16__1_3_3_3() { define void @s_shuffle_v4f16_v3f16__2_3_3_3() { ; GFX900-LABEL: s_shuffle_v4f16_v3f16__2_3_3_3: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -11555,6 +11574,7 @@ define void @s_shuffle_v4f16_v3f16__2_3_3_3() { ; ; GFX90A-LABEL: s_shuffle_v4f16_v3f16__2_3_3_3: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -11567,6 +11587,7 @@ define void @s_shuffle_v4f16_v3f16__2_3_3_3() { ; ; GFX942-LABEL: s_shuffle_v4f16_v3f16__2_3_3_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f16.v4f16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f16.v4f16.ll index 47100b9983559..1805854ef7206 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f16.v4f16.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f16.v4f16.ll @@ -99,37 +99,41 @@ define void @v_shuffle_v4f16_v4f16__1_u_u_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4f16_v4f16__2_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f16_v4f16__2_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f16_v4f16__2_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f16_v4f16__2_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x half> asm "; def $0", "=v"() @@ -237,37 +241,41 @@ define void @v_shuffle_v4f16_v4f16__5_u_u_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4f16_v4f16__6_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f16_v4f16__6_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f16_v4f16__6_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f16_v4f16__6_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x half> asm "; def $0", "=v"() @@ -7087,37 +7095,41 @@ define void @v_shuffle_v4f16_v4f16__1_4_4_4(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4f16_v4f16__2_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f16_v4f16__2_4_4_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f16_v4f16__2_4_4_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f16_v4f16__2_4_4_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x half> asm "; def $0", "=v"() @@ -12198,6 +12210,7 @@ define void @s_shuffle_v4f16_v4f16__1_u_u_u() { define void @s_shuffle_v4f16_v4f16__2_u_u_u() { ; GFX900-LABEL: s_shuffle_v4f16_v4f16__2_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -12210,6 +12223,7 @@ define void @s_shuffle_v4f16_v4f16__2_u_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4f16_v4f16__2_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -12222,6 +12236,7 @@ define void @s_shuffle_v4f16_v4f16__2_u_u_u() { ; ; GFX942-LABEL: s_shuffle_v4f16_v4f16__2_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -12339,6 +12354,7 @@ define void @s_shuffle_v4f16_v4f16__5_u_u_u() { define void @s_shuffle_v4f16_v4f16__6_u_u_u() { ; GFX900-LABEL: s_shuffle_v4f16_v4f16__6_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -12351,6 +12367,7 @@ define void @s_shuffle_v4f16_v4f16__6_u_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4f16_v4f16__6_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -12363,6 +12380,7 @@ define void @s_shuffle_v4f16_v4f16__6_u_u_u() { ; ; GFX942-LABEL: s_shuffle_v4f16_v4f16__6_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -19217,6 +19235,7 @@ define void @s_shuffle_v4f16_v4f16__1_4_4_4() { define void @s_shuffle_v4f16_v4f16__2_4_4_4() { ; GFX900-LABEL: s_shuffle_v4f16_v4f16__2_4_4_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -19229,6 +19248,7 @@ define void @s_shuffle_v4f16_v4f16__2_4_4_4() { ; ; GFX90A-LABEL: s_shuffle_v4f16_v4f16__2_4_4_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -19241,6 +19261,7 @@ define void @s_shuffle_v4f16_v4f16__2_4_4_4() { ; ; GFX942-LABEL: s_shuffle_v4f16_v4f16__2_4_4_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v2f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v2f32.ll index df148f299a165..4a37da3ca5cb1 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v2f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v2f32.ll @@ -59,35 +59,39 @@ define void @v_shuffle_v4f32_v2f32__1_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v2f32__1_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v2f32__1_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[4:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx4 v2, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -112,35 +116,39 @@ define void @v_shuffle_v4f32_v2f32__3_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v2f32__3_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v2f32__3_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[4:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx4 v2, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -271,28 +279,30 @@ define void @v_shuffle_v4f32_v2f32__3_2_u_u(ptr addrspace(1) inreg %ptr) { ; ; GFX90A-LABEL: v_shuffle_v4f32_v2f32__3_2_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v2f32__3_2_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[4:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -2062,35 +2072,39 @@ define void @v_shuffle_v4f32_v2f32__1_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v2f32__1_2_2_2: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v2f32__1_2_2_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[4:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx4 v2, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -2365,43 +2379,47 @@ define void @v_shuffle_v4f32_v2f32__3_3_2_2(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4f32_v2f32__3_3_u_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v2f32__3_3_u_2: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[4:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v2f32__3_3_u_2: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v2f32__3_3_u_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[4:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -3195,6 +3213,7 @@ define void @s_shuffle_v4f32_v2f32__0_u_u_u() { define void @s_shuffle_v4f32_v2f32__1_u_u_u() { ; GFX900-LABEL: s_shuffle_v4f32_v2f32__1_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -3207,6 +3226,7 @@ define void @s_shuffle_v4f32_v2f32__1_u_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4f32_v2f32__1_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -3219,6 +3239,7 @@ define void @s_shuffle_v4f32_v2f32__1_u_u_u() { ; ; GFX942-LABEL: s_shuffle_v4f32_v2f32__1_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -3251,6 +3272,7 @@ define void @s_shuffle_v4f32_v2f32__2_u_u_u() { define void @s_shuffle_v4f32_v2f32__3_u_u_u() { ; GFX900-LABEL: s_shuffle_v4f32_v2f32__3_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -3263,6 +3285,7 @@ define void @s_shuffle_v4f32_v2f32__3_u_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4f32_v2f32__3_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -3275,6 +3298,7 @@ define void @s_shuffle_v4f32_v2f32__3_u_u_u() { ; ; GFX942-LABEL: s_shuffle_v4f32_v2f32__3_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -3294,6 +3318,7 @@ define void @s_shuffle_v4f32_v2f32__3_u_u_u() { define void @s_shuffle_v4f32_v2f32__3_0_u_u() { ; GFX900-LABEL: s_shuffle_v4f32_v2f32__3_0_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -3310,6 +3335,7 @@ define void @s_shuffle_v4f32_v2f32__3_0_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4f32_v2f32__3_0_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -3326,6 +3352,7 @@ define void @s_shuffle_v4f32_v2f32__3_0_u_u() { ; ; GFX942-LABEL: s_shuffle_v4f32_v2f32__3_0_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -3401,6 +3428,7 @@ define void @s_shuffle_v4f32_v2f32__3_1_u_u() { define void @s_shuffle_v4f32_v2f32__3_2_u_u() { ; GFX900-LABEL: s_shuffle_v4f32_v2f32__3_2_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -3414,6 +3442,7 @@ define void @s_shuffle_v4f32_v2f32__3_2_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4f32_v2f32__3_2_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -3427,6 +3456,7 @@ define void @s_shuffle_v4f32_v2f32__3_2_u_u() { ; ; GFX942-LABEL: s_shuffle_v4f32_v2f32__3_2_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -4723,6 +4753,7 @@ define void @s_shuffle_v4f32_v2f32__0_2_2_2() { define void @s_shuffle_v4f32_v2f32__1_2_2_2() { ; GFX900-LABEL: s_shuffle_v4f32_v2f32__1_2_2_2: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -4735,6 +4766,7 @@ define void @s_shuffle_v4f32_v2f32__1_2_2_2() { ; ; GFX90A-LABEL: s_shuffle_v4f32_v2f32__1_2_2_2: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -4747,6 +4779,7 @@ define void @s_shuffle_v4f32_v2f32__1_2_2_2() { ; ; GFX942-LABEL: s_shuffle_v4f32_v2f32__1_2_2_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -4922,6 +4955,7 @@ define void @s_shuffle_v4f32_v2f32__3_3_2_2() { define void @s_shuffle_v4f32_v2f32__3_3_u_2() { ; GFX900-LABEL: s_shuffle_v4f32_v2f32__3_3_u_2: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -4936,6 +4970,7 @@ define void @s_shuffle_v4f32_v2f32__3_3_u_2() { ; ; GFX90A-LABEL: s_shuffle_v4f32_v2f32__3_3_u_2: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -4950,6 +4985,7 @@ define void @s_shuffle_v4f32_v2f32__3_3_u_2() { ; ; GFX942-LABEL: s_shuffle_v4f32_v2f32__3_3_u_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v3f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v3f32.ll index d4ee6fa20cad8..ed2998ca2fef0 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v3f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v3f32.ll @@ -59,35 +59,39 @@ define void @v_shuffle_v4f32_v3f32__1_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__1_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__1_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -101,11 +105,12 @@ define void @v_shuffle_v4f32_v3f32__2_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -113,11 +118,12 @@ define void @v_shuffle_v4f32_v3f32__2_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -125,11 +131,12 @@ define void @v_shuffle_v4f32_v3f32__2_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -154,35 +161,39 @@ define void @v_shuffle_v4f32_v3f32__4_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__4_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__4_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -197,11 +208,12 @@ define void @v_shuffle_v4f32_v3f32__5_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -209,11 +221,12 @@ define void @v_shuffle_v4f32_v3f32__5_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -221,11 +234,12 @@ define void @v_shuffle_v4f32_v3f32__5_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -255,15 +269,16 @@ define void @v_shuffle_v4f32_v3f32__5_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX90A-NEXT: v_mov_b32_e32 v9, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[6:8] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx4 v9, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -271,15 +286,16 @@ define void @v_shuffle_v4f32_v3f32__5_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX942-NEXT: v_mov_b32_e32 v9, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[6:8] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: v_mov_b32_e32 v0, v8 +; GFX942-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -347,15 +363,16 @@ define void @v_shuffle_v4f32_v3f32__5_2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[5:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -363,15 +380,16 @@ define void @v_shuffle_v4f32_v3f32__5_2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[6:8] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -379,15 +397,16 @@ define void @v_shuffle_v4f32_v3f32__5_2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[6:8] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: v_mov_b32_e32 v0, v8 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -412,28 +431,30 @@ define void @v_shuffle_v4f32_v3f32__5_3_u_u(ptr addrspace(1) inreg %ptr) { ; ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_3_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_3_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -491,12 +512,13 @@ define void @v_shuffle_v4f32_v3f32__5_5_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -504,12 +526,13 @@ define void @v_shuffle_v4f32_v3f32__5_5_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -517,12 +540,13 @@ define void @v_shuffle_v4f32_v3f32__5_5_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -537,15 +561,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[5:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -553,15 +579,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX90A-NEXT: v_mov_b32_e32 v9, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[6:8] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v8 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: global_store_dwordx4 v9, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -569,16 +597,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_0_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX942-NEXT: v_mov_b32_e32 v9, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[6:8] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v0, v8 +; GFX942-NEXT: v_mov_b32_e32 v1, v8 +; GFX942-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -609,16 +638,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -626,17 +656,18 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: v_mov_b32_e32 v2, v5 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -705,41 +736,47 @@ define void @v_shuffle_v4f32_v3f32__5_5_2_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4f32_v3f32__5_5_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_5_3_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_3_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_3_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -765,29 +802,32 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_u(ptr addrspace(1) inreg %ptr) { ; ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_4_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_4_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: v_mov_b32_e32 v2, v5 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1179,29 +1219,32 @@ define void @v_shuffle_v4f32_v3f32__u_0_0_0(ptr addrspace(1) inreg %ptr) { ; ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__u_0_0_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__u_0_0_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1374,29 +1417,32 @@ define void @v_shuffle_v4f32_v3f32__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__3_0_0_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__3_0_0_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1529,16 +1575,17 @@ define void @v_shuffle_v4f32_v3f32__5_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: ; def v[5:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1546,16 +1593,17 @@ define void @v_shuffle_v4f32_v3f32__5_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX90A-NEXT: v_mov_b32_e32 v9, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[6:8] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v9, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1563,16 +1611,17 @@ define void @v_shuffle_v4f32_v3f32__5_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX942-NEXT: v_mov_b32_e32 v9, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[6:8] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v0, v8 ; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1884,16 +1933,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[5:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1901,16 +1951,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX90A-NEXT: v_mov_b32_e32 v9, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[6:8] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v9, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1918,16 +1969,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_u_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX942-NEXT: v_mov_b32_e32 v9, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[6:8] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v0, v8 +; GFX942-NEXT: v_mov_b32_e32 v1, v8 +; GFX942-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -2545,16 +2597,17 @@ define void @v_shuffle_v4f32_v3f32__5_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2562,17 +2615,18 @@ define void @v_shuffle_v4f32_v3f32__5_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v5 +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -2876,16 +2930,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[5:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2893,15 +2948,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2909,16 +2966,18 @@ define void @v_shuffle_v4f32_v3f32__5_5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -3268,29 +3327,31 @@ define void @v_shuffle_v4f32_v3f32__1_2_2_2(ptr addrspace(1) inreg %ptr) { ; ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__1_2_2_2: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__1_2_2_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[4:5] op_sel:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -3416,8 +3477,9 @@ define void @v_shuffle_v4f32_v3f32__4_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND @@ -3433,6 +3495,7 @@ define void @v_shuffle_v4f32_v3f32__4_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: ;;#ASMSTART @@ -3871,16 +3934,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_u_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[6:8] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3888,16 +3952,17 @@ define void @v_shuffle_v4f32_v3f32__5_5_u_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[6:8] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v0, v8 +; GFX942-NEXT: v_mov_b32_e32 v1, v8 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -3986,14 +4051,15 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] @@ -4004,14 +4070,15 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[4:5] op_sel:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v6 ; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] @@ -4108,8 +4175,9 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND @@ -4126,6 +4194,7 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: ;;#ASMSTART @@ -4200,35 +4269,39 @@ define void @v_shuffle_v4f32_v3f32__1_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__1_3_3_3: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__1_3_3_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -4242,11 +4315,12 @@ define void @v_shuffle_v4f32_v3f32__2_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4254,11 +4328,12 @@ define void @v_shuffle_v4f32_v3f32__2_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4266,11 +4341,12 @@ define void @v_shuffle_v4f32_v3f32__2_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -4395,36 +4471,39 @@ define void @v_shuffle_v4f32_v3f32__5_3_3_3(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4f32_v3f32__5_u_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_u_3_3: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_u_3_3: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v1, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_u_3_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v1, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND @@ -4432,7 +4511,7 @@ define void @v_shuffle_v4f32_v3f32__5_u_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -4726,43 +4805,47 @@ define void @v_shuffle_v4f32_v3f32__5_5_3_3(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4f32_v3f32__5_5_u_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_5_u_3: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_u_3: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_u_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -5375,29 +5458,32 @@ define void @v_shuffle_v4f32_v3f32__5_u_4_4(ptr addrspace(1) inreg %ptr) { ; ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_u_4_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_u_4_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v2, v5 +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -5684,40 +5770,45 @@ define void @v_shuffle_v4f32_v3f32__5_5_u_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_u_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_u_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -6083,8 +6174,9 @@ define void @v_shuffle_v4f32_v3f32__1_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] @@ -6101,6 +6193,7 @@ define void @v_shuffle_v4f32_v3f32__1_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] @@ -6237,29 +6330,31 @@ define void @v_shuffle_v4f32_v3f32__4_5_5_5(ptr addrspace(1) inreg %ptr) { ; ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__4_5_5_5: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__4_5_5_5: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[4:5] op_sel:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -6605,13 +6700,14 @@ define void @v_shuffle_v4f32_v3f32__5_5_u_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6619,13 +6715,14 @@ define void @v_shuffle_v4f32_v3f32__5_5_u_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -6721,9 +6818,11 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -6739,9 +6838,11 @@ define void @v_shuffle_v4f32_v3f32__5_5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v1, v4 ; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -6877,6 +6978,7 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_5(ptr addrspace(1) inreg %ptr) { ; ; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_4_5: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] @@ -6891,6 +6993,7 @@ define void @v_shuffle_v4f32_v3f32__5_5_4_5(ptr addrspace(1) inreg %ptr) { ; ; GFX942-LABEL: v_shuffle_v4f32_v3f32__5_5_4_5: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] @@ -6966,6 +7069,7 @@ define void @s_shuffle_v4f32_v3f32__0_u_u_u() { define void @s_shuffle_v4f32_v3f32__1_u_u_u() { ; GFX900-LABEL: s_shuffle_v4f32_v3f32__1_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -6978,6 +7082,7 @@ define void @s_shuffle_v4f32_v3f32__1_u_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4f32_v3f32__1_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -6990,6 +7095,7 @@ define void @s_shuffle_v4f32_v3f32__1_u_u_u() { ; ; GFX942-LABEL: s_shuffle_v4f32_v3f32__1_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -7008,6 +7114,7 @@ define void @s_shuffle_v4f32_v3f32__1_u_u_u() { define void @s_shuffle_v4f32_v3f32__2_u_u_u() { ; GFX900-LABEL: s_shuffle_v4f32_v3f32__2_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -7020,6 +7127,7 @@ define void @s_shuffle_v4f32_v3f32__2_u_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4f32_v3f32__2_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -7032,6 +7140,7 @@ define void @s_shuffle_v4f32_v3f32__2_u_u_u() { ; ; GFX942-LABEL: s_shuffle_v4f32_v3f32__2_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -7064,6 +7173,7 @@ define void @s_shuffle_v4f32_v3f32__3_u_u_u() { define void @s_shuffle_v4f32_v3f32__4_u_u_u() { ; GFX900-LABEL: s_shuffle_v4f32_v3f32__4_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -7076,6 +7186,7 @@ define void @s_shuffle_v4f32_v3f32__4_u_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4f32_v3f32__4_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -7088,6 +7199,7 @@ define void @s_shuffle_v4f32_v3f32__4_u_u_u() { ; ; GFX942-LABEL: s_shuffle_v4f32_v3f32__4_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -7107,6 +7219,7 @@ define void @s_shuffle_v4f32_v3f32__4_u_u_u() { define void @s_shuffle_v4f32_v3f32__5_u_u_u() { ; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -7119,6 +7232,7 @@ define void @s_shuffle_v4f32_v3f32__5_u_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -7131,6 +7245,7 @@ define void @s_shuffle_v4f32_v3f32__5_u_u_u() { ; ; GFX942-LABEL: s_shuffle_v4f32_v3f32__5_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -7150,14 +7265,15 @@ define void @s_shuffle_v4f32_v3f32__5_u_u_u() { define void @s_shuffle_v4f32_v3f32__5_0_u_u() { ; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_0_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ; def s[12:14] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s8, s14 ; GFX900-NEXT: s_mov_b32 s9, s4 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] @@ -7166,14 +7282,15 @@ define void @s_shuffle_v4f32_v3f32__5_0_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_0_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ; def s[12:14] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s8, s14 ; GFX90A-NEXT: s_mov_b32 s9, s4 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] @@ -7182,6 +7299,7 @@ define void @s_shuffle_v4f32_v3f32__5_0_u_u() { ; ; GFX942-LABEL: s_shuffle_v4f32_v3f32__5_0_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -7257,14 +7375,15 @@ define void @s_shuffle_v4f32_v3f32__5_1_u_u() { define void @s_shuffle_v4f32_v3f32__5_2_u_u() { ; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_2_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ; def s[12:14] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s8, s14 ; GFX900-NEXT: s_mov_b32 s9, s6 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] @@ -7273,14 +7392,15 @@ define void @s_shuffle_v4f32_v3f32__5_2_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_2_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ; def s[12:14] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s8, s14 ; GFX90A-NEXT: s_mov_b32 s9, s6 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] @@ -7289,6 +7409,7 @@ define void @s_shuffle_v4f32_v3f32__5_2_u_u() { ; ; GFX942-LABEL: s_shuffle_v4f32_v3f32__5_2_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -7312,6 +7433,7 @@ define void @s_shuffle_v4f32_v3f32__5_2_u_u() { define void @s_shuffle_v4f32_v3f32__5_3_u_u() { ; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_3_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -7325,6 +7447,7 @@ define void @s_shuffle_v4f32_v3f32__5_3_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_3_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -7338,6 +7461,7 @@ define void @s_shuffle_v4f32_v3f32__5_3_u_u() { ; ; GFX942-LABEL: s_shuffle_v4f32_v3f32__5_3_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -7377,6 +7501,7 @@ define void @s_shuffle_v4f32_v3f32__5_4_u_u() { define void @s_shuffle_v4f32_v3f32__5_5_u_u() { ; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_5_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -7390,6 +7515,7 @@ define void @s_shuffle_v4f32_v3f32__5_5_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_5_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -7403,6 +7529,7 @@ define void @s_shuffle_v4f32_v3f32__5_5_u_u() { ; ; GFX942-LABEL: s_shuffle_v4f32_v3f32__5_5_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -7423,15 +7550,16 @@ define void @s_shuffle_v4f32_v3f32__5_5_u_u() { define void @s_shuffle_v4f32_v3f32__5_5_0_u() { ; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_5_0_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ; def s[12:14] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s14 ; GFX900-NEXT: s_mov_b32 s10, s4 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] @@ -7440,15 +7568,16 @@ define void @s_shuffle_v4f32_v3f32__5_5_0_u() { ; ; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_5_0_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ; def s[12:14] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s14 ; GFX90A-NEXT: s_mov_b32 s10, s4 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] @@ -7457,6 +7586,7 @@ define void @s_shuffle_v4f32_v3f32__5_5_0_u() { ; ; GFX942-LABEL: s_shuffle_v4f32_v3f32__5_5_0_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -7481,15 +7611,16 @@ define void @s_shuffle_v4f32_v3f32__5_5_0_u() { define void @s_shuffle_v4f32_v3f32__5_5_1_u() { ; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_5_1_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ; def s[12:14] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s14 ; GFX900-NEXT: s_mov_b32 s10, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] @@ -7498,15 +7629,16 @@ define void @s_shuffle_v4f32_v3f32__5_5_1_u() { ; ; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_5_1_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ; def s[12:14] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s14 ; GFX90A-NEXT: s_mov_b32 s10, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] @@ -7515,6 +7647,7 @@ define void @s_shuffle_v4f32_v3f32__5_5_1_u() { ; ; GFX942-LABEL: s_shuffle_v4f32_v3f32__5_5_1_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -7594,6 +7727,7 @@ define void @s_shuffle_v4f32_v3f32__5_5_2_u() { define void @s_shuffle_v4f32_v3f32__5_5_3_u() { ; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_5_3_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -7608,6 +7742,7 @@ define void @s_shuffle_v4f32_v3f32__5_5_3_u() { ; ; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_5_3_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -7622,6 +7757,7 @@ define void @s_shuffle_v4f32_v3f32__5_5_3_u() { ; ; GFX942-LABEL: s_shuffle_v4f32_v3f32__5_5_3_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -7643,6 +7779,7 @@ define void @s_shuffle_v4f32_v3f32__5_5_3_u() { define void @s_shuffle_v4f32_v3f32__5_5_4_u() { ; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_5_4_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -7657,6 +7794,7 @@ define void @s_shuffle_v4f32_v3f32__5_5_4_u() { ; ; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_5_4_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -7671,6 +7809,7 @@ define void @s_shuffle_v4f32_v3f32__5_5_4_u() { ; ; GFX942-LABEL: s_shuffle_v4f32_v3f32__5_5_4_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -8011,6 +8150,7 @@ define void @s_shuffle_v4f32_v3f32__5_5_5_5() { define void @s_shuffle_v4f32_v3f32__u_0_0_0() { ; GFX900-LABEL: s_shuffle_v4f32_v3f32__u_0_0_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -8025,6 +8165,7 @@ define void @s_shuffle_v4f32_v3f32__u_0_0_0() { ; ; GFX90A-LABEL: s_shuffle_v4f32_v3f32__u_0_0_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -8039,6 +8180,7 @@ define void @s_shuffle_v4f32_v3f32__u_0_0_0() { ; ; GFX942-LABEL: s_shuffle_v4f32_v3f32__u_0_0_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -8181,6 +8323,7 @@ define void @s_shuffle_v4f32_v3f32__2_0_0_0() { define void @s_shuffle_v4f32_v3f32__3_0_0_0() { ; GFX900-LABEL: s_shuffle_v4f32_v3f32__3_0_0_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -8195,6 +8338,7 @@ define void @s_shuffle_v4f32_v3f32__3_0_0_0() { ; ; GFX90A-LABEL: s_shuffle_v4f32_v3f32__3_0_0_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -8209,6 +8353,7 @@ define void @s_shuffle_v4f32_v3f32__3_0_0_0() { ; ; GFX942-LABEL: s_shuffle_v4f32_v3f32__3_0_0_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -8351,14 +8496,15 @@ define void @s_shuffle_v4f32_v3f32__5_0_0_0() { define void @s_shuffle_v4f32_v3f32__5_u_0_0() { ; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_u_0_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ; def s[12:14] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s8, s14 ; GFX900-NEXT: s_mov_b32 s10, s4 ; GFX900-NEXT: s_mov_b32 s11, s4 ; GFX900-NEXT: ;;#ASMSTART @@ -8368,14 +8514,15 @@ define void @s_shuffle_v4f32_v3f32__5_u_0_0() { ; ; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_u_0_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ; def s[12:14] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s8, s14 ; GFX90A-NEXT: s_mov_b32 s10, s4 ; GFX90A-NEXT: s_mov_b32 s11, s4 ; GFX90A-NEXT: ;;#ASMSTART @@ -8385,6 +8532,7 @@ define void @s_shuffle_v4f32_v3f32__5_u_0_0() { ; ; GFX942-LABEL: s_shuffle_v4f32_v3f32__5_u_0_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -8711,15 +8859,16 @@ define void @s_shuffle_v4f32_v3f32__5_5_0_0() { define void @s_shuffle_v4f32_v3f32__5_5_u_0() { ; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_5_u_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ; def s[12:14] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s14 ; GFX900-NEXT: s_mov_b32 s11, s4 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] @@ -8728,15 +8877,16 @@ define void @s_shuffle_v4f32_v3f32__5_5_u_0() { ; ; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_5_u_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ; def s[12:14] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s14 ; GFX90A-NEXT: s_mov_b32 s11, s4 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] @@ -8745,6 +8895,7 @@ define void @s_shuffle_v4f32_v3f32__5_5_u_0() { ; ; GFX942-LABEL: s_shuffle_v4f32_v3f32__5_5_u_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -9226,14 +9377,15 @@ define void @s_shuffle_v4f32_v3f32__5_1_1_1() { define void @s_shuffle_v4f32_v3f32__5_u_1_1() { ; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_u_1_1: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ; def s[12:14] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s8, s14 ; GFX900-NEXT: s_mov_b32 s10, s5 ; GFX900-NEXT: s_mov_b32 s11, s5 ; GFX900-NEXT: ;;#ASMSTART @@ -9243,14 +9395,15 @@ define void @s_shuffle_v4f32_v3f32__5_u_1_1() { ; ; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_u_1_1: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ; def s[12:14] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s8, s14 ; GFX90A-NEXT: s_mov_b32 s10, s5 ; GFX90A-NEXT: s_mov_b32 s11, s5 ; GFX90A-NEXT: ;;#ASMSTART @@ -9260,6 +9413,7 @@ define void @s_shuffle_v4f32_v3f32__5_u_1_1() { ; ; GFX942-LABEL: s_shuffle_v4f32_v3f32__5_u_1_1: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -9586,15 +9740,16 @@ define void @s_shuffle_v4f32_v3f32__5_5_1_1() { define void @s_shuffle_v4f32_v3f32__5_5_u_1() { ; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_5_u_1: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ; def s[12:14] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s14 ; GFX900-NEXT: s_mov_b32 s11, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] @@ -9603,15 +9758,16 @@ define void @s_shuffle_v4f32_v3f32__5_5_u_1() { ; ; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_5_u_1: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ; def s[12:14] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s14 ; GFX90A-NEXT: s_mov_b32 s11, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] @@ -9620,6 +9776,7 @@ define void @s_shuffle_v4f32_v3f32__5_5_u_1() { ; ; GFX942-LABEL: s_shuffle_v4f32_v3f32__5_5_u_1: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -10446,15 +10603,16 @@ define void @s_shuffle_v4f32_v3f32__5_5_2_2() { define void @s_shuffle_v4f32_v3f32__5_5_u_2() { ; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_5_u_2: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ; def s[12:14] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s14 ; GFX900-NEXT: s_mov_b32 s11, s6 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] @@ -10463,15 +10621,16 @@ define void @s_shuffle_v4f32_v3f32__5_5_u_2() { ; ; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_5_u_2: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ; def s[12:14] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s14 ; GFX90A-NEXT: s_mov_b32 s11, s6 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] @@ -10480,6 +10639,7 @@ define void @s_shuffle_v4f32_v3f32__5_5_u_2() { ; ; GFX942-LABEL: s_shuffle_v4f32_v3f32__5_5_u_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -10802,6 +10962,7 @@ define void @s_shuffle_v4f32_v3f32__0_3_3_3() { define void @s_shuffle_v4f32_v3f32__1_3_3_3() { ; GFX900-LABEL: s_shuffle_v4f32_v3f32__1_3_3_3: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -10814,6 +10975,7 @@ define void @s_shuffle_v4f32_v3f32__1_3_3_3() { ; ; GFX90A-LABEL: s_shuffle_v4f32_v3f32__1_3_3_3: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -10826,6 +10988,7 @@ define void @s_shuffle_v4f32_v3f32__1_3_3_3() { ; ; GFX942-LABEL: s_shuffle_v4f32_v3f32__1_3_3_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -10844,6 +11007,7 @@ define void @s_shuffle_v4f32_v3f32__1_3_3_3() { define void @s_shuffle_v4f32_v3f32__2_3_3_3() { ; GFX900-LABEL: s_shuffle_v4f32_v3f32__2_3_3_3: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -10856,6 +11020,7 @@ define void @s_shuffle_v4f32_v3f32__2_3_3_3() { ; ; GFX90A-LABEL: s_shuffle_v4f32_v3f32__2_3_3_3: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -10868,6 +11033,7 @@ define void @s_shuffle_v4f32_v3f32__2_3_3_3() { ; ; GFX942-LABEL: s_shuffle_v4f32_v3f32__2_3_3_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -11004,6 +11170,7 @@ define void @s_shuffle_v4f32_v3f32__5_3_3_3() { define void @s_shuffle_v4f32_v3f32__5_u_3_3() { ; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_u_3_3: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -11018,6 +11185,7 @@ define void @s_shuffle_v4f32_v3f32__5_u_3_3() { ; ; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_u_3_3: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -11032,6 +11200,7 @@ define void @s_shuffle_v4f32_v3f32__5_u_3_3() { ; ; GFX942-LABEL: s_shuffle_v4f32_v3f32__5_u_3_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -11337,6 +11506,7 @@ define void @s_shuffle_v4f32_v3f32__5_5_3_3() { define void @s_shuffle_v4f32_v3f32__5_5_u_3() { ; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_5_u_3: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -11351,6 +11521,7 @@ define void @s_shuffle_v4f32_v3f32__5_5_u_3() { ; ; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_5_u_3: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -11365,6 +11536,7 @@ define void @s_shuffle_v4f32_v3f32__5_5_u_3() { ; ; GFX942-LABEL: s_shuffle_v4f32_v3f32__5_5_u_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -11874,6 +12046,7 @@ define void @s_shuffle_v4f32_v3f32__5_4_4_4() { define void @s_shuffle_v4f32_v3f32__5_u_4_4() { ; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_u_4_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -11888,6 +12061,7 @@ define void @s_shuffle_v4f32_v3f32__5_u_4_4() { ; ; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_u_4_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -11902,6 +12076,7 @@ define void @s_shuffle_v4f32_v3f32__5_u_4_4() { ; ; GFX942-LABEL: s_shuffle_v4f32_v3f32__5_u_4_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -12207,6 +12382,7 @@ define void @s_shuffle_v4f32_v3f32__5_5_4_4() { define void @s_shuffle_v4f32_v3f32__5_5_u_4() { ; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_5_u_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -12221,6 +12397,7 @@ define void @s_shuffle_v4f32_v3f32__5_5_u_4() { ; ; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_5_u_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -12235,6 +12412,7 @@ define void @s_shuffle_v4f32_v3f32__5_5_u_4() { ; ; GFX942-LABEL: s_shuffle_v4f32_v3f32__5_5_u_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -12989,6 +13167,7 @@ define void @s_shuffle_v4f32_v3f32__5_4_5_5() { define void @s_shuffle_v4f32_v3f32__5_5_u_5() { ; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_5_u_5: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -13003,6 +13182,7 @@ define void @s_shuffle_v4f32_v3f32__5_5_u_5() { ; ; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_5_u_5: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -13017,6 +13197,7 @@ define void @s_shuffle_v4f32_v3f32__5_5_u_5() { ; ; GFX942-LABEL: s_shuffle_v4f32_v3f32__5_5_u_5: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v4f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v4f32.ll index edc540edb3ad1..8242805658876 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v4f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v4f32.ll @@ -61,9 +61,10 @@ define void @v_shuffle_v4f32_v4f32__1_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -73,9 +74,10 @@ define void @v_shuffle_v4f32_v4f32__1_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -85,9 +87,10 @@ define void @v_shuffle_v4f32_v4f32__1_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -103,33 +106,37 @@ define void @v_shuffle_v4f32_v4f32__2_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__2_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__2_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -141,37 +148,41 @@ define void @v_shuffle_v4f32_v4f32__2_u_u_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4f32_v4f32__3_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__3_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__3_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__3_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -198,9 +209,10 @@ define void @v_shuffle_v4f32_v4f32__5_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -210,9 +222,10 @@ define void @v_shuffle_v4f32_v4f32__5_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -222,9 +235,10 @@ define void @v_shuffle_v4f32_v4f32__5_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -241,33 +255,37 @@ define void @v_shuffle_v4f32_v4f32__6_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__6_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__6_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -280,37 +298,41 @@ define void @v_shuffle_v4f32_v4f32__6_u_u_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4f32_v4f32__7_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -327,13 +349,14 @@ define void @v_shuffle_v4f32_v4f32__7_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr5_vgpr6_vgpr7_vgpr8 +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx4 v9, v[5:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -434,13 +457,14 @@ define void @v_shuffle_v4f32_v4f32__7_2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ; implicit-def: $vgpr7_vgpr8_vgpr9_vgpr10 +; GFX900-NEXT: v_mov_b32_e32 v11, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v7, v6 +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: global_store_dwordx4 v11, v[7:10], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -484,49 +508,53 @@ define void @v_shuffle_v4f32_v4f32__7_2_u_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4f32_v4f32__7_3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_3_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr8_vgpr9_vgpr10_vgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v7 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_3_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr8_vgpr9_vgpr10_vgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v7 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_3_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr8_vgpr9_vgpr10_vgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v7 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -539,14 +567,15 @@ define void @v_shuffle_v4f32_v4f32__7_3_u_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4f32_v4f32__7_4_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_4_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx4 v4, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -626,14 +655,15 @@ define void @v_shuffle_v4f32_v4f32__7_5_u_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4f32_v4f32__7_6_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_6_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -670,40 +700,44 @@ define void @v_shuffle_v4f32_v4f32__7_6_u_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4f32_v4f32__7_7_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -720,14 +754,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr5_vgpr6_vgpr7_vgpr8 +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v0 +; GFX900-NEXT: global_store_dwordx4 v9, v[5:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -737,14 +772,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_0_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v5 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -754,15 +790,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_0_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v5 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -779,14 +815,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_1_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: v_mov_b32_e32 v8, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -796,14 +833,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_1_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v5 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -813,15 +851,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_1_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v5 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -890,52 +928,56 @@ define void @v_shuffle_v4f32_v4f32__7_7_2_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4f32_v4f32__7_7_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_3_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr8_vgpr9_vgpr10_vgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v7 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: v_mov_b32_e32 v10, v3 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_3_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr8_vgpr9_vgpr10_vgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v7 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: v_mov_b32_e32 v10, v3 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_3_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr8_vgpr9_vgpr10_vgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v7 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: v_mov_b32_e32 v10, v3 +; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -948,41 +990,47 @@ define void @v_shuffle_v4f32_v4f32__7_7_3_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4f32_v4f32__7_7_4_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_4_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx4 v4, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_4_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_4_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -995,40 +1043,47 @@ define void @v_shuffle_v4f32_v4f32__7_7_4_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4f32_v4f32__7_7_5_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_5_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_5_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_5_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -1087,43 +1142,47 @@ define void @v_shuffle_v4f32_v4f32__7_7_6_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4f32_v4f32__7_7_7_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_7_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_7_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_7_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -1578,11 +1637,12 @@ define void @v_shuffle_v4f32_v4f32__u_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1592,11 +1652,12 @@ define void @v_shuffle_v4f32_v4f32__u_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1605,12 +1666,13 @@ define void @v_shuffle_v4f32_v4f32__u_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -1820,11 +1882,12 @@ define void @v_shuffle_v4f32_v4f32__4_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1834,11 +1897,12 @@ define void @v_shuffle_v4f32_v4f32__4_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1848,11 +1912,12 @@ define void @v_shuffle_v4f32_v4f32__4_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -2050,14 +2115,15 @@ define void @v_shuffle_v4f32_v4f32__7_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr5_vgpr6_vgpr7_vgpr8 +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v0 +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: global_store_dwordx4 v9, v[5:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2067,14 +2133,15 @@ define void @v_shuffle_v4f32_v4f32__7_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v5 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2084,15 +2151,15 @@ define void @v_shuffle_v4f32_v4f32__7_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -2529,14 +2596,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr5_vgpr6_vgpr7_vgpr8 +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: global_store_dwordx4 v9, v[5:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2546,14 +2614,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v5 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2563,15 +2632,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_u_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v5 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -3400,14 +3469,15 @@ define void @v_shuffle_v4f32_v4f32__7_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: v_mov_b32_e32 v8, v1 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3417,14 +3487,15 @@ define void @v_shuffle_v4f32_v4f32__7_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v5 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3434,15 +3505,15 @@ define void @v_shuffle_v4f32_v4f32__7_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v5 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -3877,14 +3948,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3894,14 +3966,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_u_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v5 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3911,15 +3984,15 @@ define void @v_shuffle_v4f32_v4f32__7_7_u_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v5 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -5196,48 +5269,52 @@ define void @v_shuffle_v4f32_v4f32__7_7_u_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr7_vgpr8_vgpr9_vgpr10 +; GFX900-NEXT: v_mov_b32_e32 v11, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v7, v6 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v10, v2 +; GFX900-NEXT: global_store_dwordx4 v11, v[7:10], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_u_2: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr8_vgpr9_vgpr10_vgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v7 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: v_mov_b32_e32 v11, v2 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_u_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr8_vgpr9_vgpr10_vgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v7 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: v_mov_b32_e32 v11, v2 +; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -6974,9 +7051,10 @@ define void @v_shuffle_v4f32_v4f32__1_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6986,9 +7064,10 @@ define void @v_shuffle_v4f32_v4f32__1_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6998,9 +7077,10 @@ define void @v_shuffle_v4f32_v4f32__1_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -7016,33 +7096,37 @@ define void @v_shuffle_v4f32_v4f32__2_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__2_4_4_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__2_4_4_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -7054,37 +7138,41 @@ define void @v_shuffle_v4f32_v4f32__2_4_4_4(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4f32_v4f32__3_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__3_4_4_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__3_4_4_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__3_4_4_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -7256,43 +7344,47 @@ define void @v_shuffle_v4f32_v4f32__7_4_4_4(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4f32_v4f32__7_u_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_u_4_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v0 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_u_4_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_u_4_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -7694,41 +7786,47 @@ define void @v_shuffle_v4f32_v4f32__7_7_4_4(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4f32_v4f32__7_7_u_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_u_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v7, v0 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_u_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_u_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -8548,43 +8646,47 @@ define void @v_shuffle_v4f32_v4f32__7_5_5_5(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4f32_v4f32__7_u_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_u_5_5: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_u_5_5: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_u_5_5: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -8980,40 +9082,47 @@ define void @v_shuffle_v4f32_v4f32__7_7_5_5(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4f32_v4f32__7_7_u_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_u_5: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_u_5: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_u_5: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -10248,43 +10357,47 @@ define void @v_shuffle_v4f32_v4f32__7_7_6_6(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4f32_v4f32__7_7_u_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_u_6: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v7, v2 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_u_6: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4f32_v4f32__7_7_u_6: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -11967,6 +12080,7 @@ define void @s_shuffle_v4f32_v4f32__0_u_u_u() { define void @s_shuffle_v4f32_v4f32__1_u_u_u() { ; GFX900-LABEL: s_shuffle_v4f32_v4f32__1_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -11979,6 +12093,7 @@ define void @s_shuffle_v4f32_v4f32__1_u_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4f32_v4f32__1_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -11991,6 +12106,7 @@ define void @s_shuffle_v4f32_v4f32__1_u_u_u() { ; ; GFX942-LABEL: s_shuffle_v4f32_v4f32__1_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -12009,6 +12125,7 @@ define void @s_shuffle_v4f32_v4f32__1_u_u_u() { define void @s_shuffle_v4f32_v4f32__2_u_u_u() { ; GFX900-LABEL: s_shuffle_v4f32_v4f32__2_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -12021,6 +12138,7 @@ define void @s_shuffle_v4f32_v4f32__2_u_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4f32_v4f32__2_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -12033,6 +12151,7 @@ define void @s_shuffle_v4f32_v4f32__2_u_u_u() { ; ; GFX942-LABEL: s_shuffle_v4f32_v4f32__2_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -12051,6 +12170,7 @@ define void @s_shuffle_v4f32_v4f32__2_u_u_u() { define void @s_shuffle_v4f32_v4f32__3_u_u_u() { ; GFX900-LABEL: s_shuffle_v4f32_v4f32__3_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -12063,6 +12183,7 @@ define void @s_shuffle_v4f32_v4f32__3_u_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4f32_v4f32__3_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -12075,6 +12196,7 @@ define void @s_shuffle_v4f32_v4f32__3_u_u_u() { ; ; GFX942-LABEL: s_shuffle_v4f32_v4f32__3_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -12107,6 +12229,7 @@ define void @s_shuffle_v4f32_v4f32__4_u_u_u() { define void @s_shuffle_v4f32_v4f32__5_u_u_u() { ; GFX900-LABEL: s_shuffle_v4f32_v4f32__5_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -12119,6 +12242,7 @@ define void @s_shuffle_v4f32_v4f32__5_u_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4f32_v4f32__5_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -12131,6 +12255,7 @@ define void @s_shuffle_v4f32_v4f32__5_u_u_u() { ; ; GFX942-LABEL: s_shuffle_v4f32_v4f32__5_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -12150,6 +12275,7 @@ define void @s_shuffle_v4f32_v4f32__5_u_u_u() { define void @s_shuffle_v4f32_v4f32__6_u_u_u() { ; GFX900-LABEL: s_shuffle_v4f32_v4f32__6_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -12162,6 +12288,7 @@ define void @s_shuffle_v4f32_v4f32__6_u_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4f32_v4f32__6_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -12174,6 +12301,7 @@ define void @s_shuffle_v4f32_v4f32__6_u_u_u() { ; ; GFX942-LABEL: s_shuffle_v4f32_v4f32__6_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -12193,6 +12321,7 @@ define void @s_shuffle_v4f32_v4f32__6_u_u_u() { define void @s_shuffle_v4f32_v4f32__7_u_u_u() { ; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -12205,6 +12334,7 @@ define void @s_shuffle_v4f32_v4f32__7_u_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -12217,6 +12347,7 @@ define void @s_shuffle_v4f32_v4f32__7_u_u_u() { ; ; GFX942-LABEL: s_shuffle_v4f32_v4f32__7_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -12236,14 +12367,15 @@ define void @s_shuffle_v4f32_v4f32__7_u_u_u() { define void @s_shuffle_v4f32_v4f32__7_0_u_u() { ; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_0_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ; def s[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ; def s[12:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s8, s15 ; GFX900-NEXT: s_mov_b32 s9, s4 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] @@ -12252,14 +12384,15 @@ define void @s_shuffle_v4f32_v4f32__7_0_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_0_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ; def s[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ; def s[12:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s8, s15 ; GFX90A-NEXT: s_mov_b32 s9, s4 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] @@ -12268,6 +12401,7 @@ define void @s_shuffle_v4f32_v4f32__7_0_u_u() { ; ; GFX942-LABEL: s_shuffle_v4f32_v4f32__7_0_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -12343,14 +12477,15 @@ define void @s_shuffle_v4f32_v4f32__7_1_u_u() { define void @s_shuffle_v4f32_v4f32__7_2_u_u() { ; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_2_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ; def s[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ; def s[12:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s8, s15 ; GFX900-NEXT: s_mov_b32 s9, s6 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] @@ -12359,14 +12494,15 @@ define void @s_shuffle_v4f32_v4f32__7_2_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_2_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ; def s[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ; def s[12:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s8, s15 ; GFX90A-NEXT: s_mov_b32 s9, s6 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] @@ -12375,6 +12511,7 @@ define void @s_shuffle_v4f32_v4f32__7_2_u_u() { ; ; GFX942-LABEL: s_shuffle_v4f32_v4f32__7_2_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -12398,14 +12535,15 @@ define void @s_shuffle_v4f32_v4f32__7_2_u_u() { define void @s_shuffle_v4f32_v4f32__7_3_u_u() { ; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_3_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ; def s[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ; def s[12:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s8, s15 ; GFX900-NEXT: s_mov_b32 s9, s7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] @@ -12414,14 +12552,15 @@ define void @s_shuffle_v4f32_v4f32__7_3_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_3_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ; def s[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ; def s[12:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s8, s15 ; GFX90A-NEXT: s_mov_b32 s9, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] @@ -12430,6 +12569,7 @@ define void @s_shuffle_v4f32_v4f32__7_3_u_u() { ; ; GFX942-LABEL: s_shuffle_v4f32_v4f32__7_3_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -12453,6 +12593,7 @@ define void @s_shuffle_v4f32_v4f32__7_3_u_u() { define void @s_shuffle_v4f32_v4f32__7_4_u_u() { ; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_4_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -12466,6 +12607,7 @@ define void @s_shuffle_v4f32_v4f32__7_4_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_4_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -12479,6 +12621,7 @@ define void @s_shuffle_v4f32_v4f32__7_4_u_u() { ; ; GFX942-LABEL: s_shuffle_v4f32_v4f32__7_4_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -12518,6 +12661,7 @@ define void @s_shuffle_v4f32_v4f32__7_5_u_u() { define void @s_shuffle_v4f32_v4f32__7_6_u_u() { ; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_6_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -12531,6 +12675,7 @@ define void @s_shuffle_v4f32_v4f32__7_6_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_6_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -12544,6 +12689,7 @@ define void @s_shuffle_v4f32_v4f32__7_6_u_u() { ; ; GFX942-LABEL: s_shuffle_v4f32_v4f32__7_6_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -12564,6 +12710,7 @@ define void @s_shuffle_v4f32_v4f32__7_6_u_u() { define void @s_shuffle_v4f32_v4f32__7_7_u_u() { ; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -12577,6 +12724,7 @@ define void @s_shuffle_v4f32_v4f32__7_7_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -12590,6 +12738,7 @@ define void @s_shuffle_v4f32_v4f32__7_7_u_u() { ; ; GFX942-LABEL: s_shuffle_v4f32_v4f32__7_7_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -12610,15 +12759,16 @@ define void @s_shuffle_v4f32_v4f32__7_7_u_u() { define void @s_shuffle_v4f32_v4f32__7_7_0_u() { ; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_0_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ; def s[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ; def s[12:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s11 -; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s8, s15 +; GFX900-NEXT: s_mov_b32 s9, s15 ; GFX900-NEXT: s_mov_b32 s10, s4 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] @@ -12627,15 +12777,16 @@ define void @s_shuffle_v4f32_v4f32__7_7_0_u() { ; ; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_0_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ; def s[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ; def s[12:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s11 -; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s8, s15 +; GFX90A-NEXT: s_mov_b32 s9, s15 ; GFX90A-NEXT: s_mov_b32 s10, s4 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] @@ -12644,6 +12795,7 @@ define void @s_shuffle_v4f32_v4f32__7_7_0_u() { ; ; GFX942-LABEL: s_shuffle_v4f32_v4f32__7_7_0_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -12668,15 +12820,16 @@ define void @s_shuffle_v4f32_v4f32__7_7_0_u() { define void @s_shuffle_v4f32_v4f32__7_7_1_u() { ; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_1_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ; def s[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ; def s[12:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s11 -; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s8, s15 +; GFX900-NEXT: s_mov_b32 s9, s15 ; GFX900-NEXT: s_mov_b32 s10, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] @@ -12685,15 +12838,16 @@ define void @s_shuffle_v4f32_v4f32__7_7_1_u() { ; ; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_1_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ; def s[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ; def s[12:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s11 -; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s8, s15 +; GFX90A-NEXT: s_mov_b32 s9, s15 ; GFX90A-NEXT: s_mov_b32 s10, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] @@ -12702,6 +12856,7 @@ define void @s_shuffle_v4f32_v4f32__7_7_1_u() { ; ; GFX942-LABEL: s_shuffle_v4f32_v4f32__7_7_1_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -12781,15 +12936,16 @@ define void @s_shuffle_v4f32_v4f32__7_7_2_u() { define void @s_shuffle_v4f32_v4f32__7_7_3_u() { ; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_3_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ; def s[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ; def s[12:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s11 -; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s8, s15 +; GFX900-NEXT: s_mov_b32 s9, s15 ; GFX900-NEXT: s_mov_b32 s10, s7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] @@ -12798,15 +12954,16 @@ define void @s_shuffle_v4f32_v4f32__7_7_3_u() { ; ; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_3_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ; def s[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ; def s[12:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s11 -; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s8, s15 +; GFX90A-NEXT: s_mov_b32 s9, s15 ; GFX90A-NEXT: s_mov_b32 s10, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] @@ -12815,6 +12972,7 @@ define void @s_shuffle_v4f32_v4f32__7_7_3_u() { ; ; GFX942-LABEL: s_shuffle_v4f32_v4f32__7_7_3_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -12839,6 +12997,7 @@ define void @s_shuffle_v4f32_v4f32__7_7_3_u() { define void @s_shuffle_v4f32_v4f32__7_7_4_u() { ; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_4_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -12853,6 +13012,7 @@ define void @s_shuffle_v4f32_v4f32__7_7_4_u() { ; ; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_4_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -12867,6 +13027,7 @@ define void @s_shuffle_v4f32_v4f32__7_7_4_u() { ; ; GFX942-LABEL: s_shuffle_v4f32_v4f32__7_7_4_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -12888,6 +13049,7 @@ define void @s_shuffle_v4f32_v4f32__7_7_4_u() { define void @s_shuffle_v4f32_v4f32__7_7_5_u() { ; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_5_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -12902,6 +13064,7 @@ define void @s_shuffle_v4f32_v4f32__7_7_5_u() { ; ; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_5_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -12916,6 +13079,7 @@ define void @s_shuffle_v4f32_v4f32__7_7_5_u() { ; ; GFX942-LABEL: s_shuffle_v4f32_v4f32__7_7_5_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -12957,6 +13121,7 @@ define void @s_shuffle_v4f32_v4f32__7_7_6_u() { define void @s_shuffle_v4f32_v4f32__7_7_7_u() { ; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_7_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -12971,6 +13136,7 @@ define void @s_shuffle_v4f32_v4f32__7_7_7_u() { ; ; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_7_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -12985,6 +13151,7 @@ define void @s_shuffle_v4f32_v4f32__7_7_7_u() { ; ; GFX942-LABEL: s_shuffle_v4f32_v4f32__7_7_7_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -13424,6 +13591,7 @@ define void @s_shuffle_v4f32_v4f32__7_7_7_7() { define void @s_shuffle_v4f32_v4f32__u_0_0_0() { ; GFX900-LABEL: s_shuffle_v4f32_v4f32__u_0_0_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -13438,6 +13606,7 @@ define void @s_shuffle_v4f32_v4f32__u_0_0_0() { ; ; GFX90A-LABEL: s_shuffle_v4f32_v4f32__u_0_0_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -13452,6 +13621,7 @@ define void @s_shuffle_v4f32_v4f32__u_0_0_0() { ; ; GFX942-LABEL: s_shuffle_v4f32_v4f32__u_0_0_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -13645,6 +13815,7 @@ define void @s_shuffle_v4f32_v4f32__3_0_0_0() { define void @s_shuffle_v4f32_v4f32__4_0_0_0() { ; GFX900-LABEL: s_shuffle_v4f32_v4f32__4_0_0_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -13659,6 +13830,7 @@ define void @s_shuffle_v4f32_v4f32__4_0_0_0() { ; ; GFX90A-LABEL: s_shuffle_v4f32_v4f32__4_0_0_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -13673,6 +13845,7 @@ define void @s_shuffle_v4f32_v4f32__4_0_0_0() { ; ; GFX942-LABEL: s_shuffle_v4f32_v4f32__4_0_0_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -13876,14 +14049,15 @@ define void @s_shuffle_v4f32_v4f32__7_0_0_0() { define void @s_shuffle_v4f32_v4f32__7_u_0_0() { ; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_u_0_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ; def s[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ; def s[12:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s8, s15 ; GFX900-NEXT: s_mov_b32 s10, s4 ; GFX900-NEXT: s_mov_b32 s11, s4 ; GFX900-NEXT: ;;#ASMSTART @@ -13893,14 +14067,15 @@ define void @s_shuffle_v4f32_v4f32__7_u_0_0() { ; ; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_u_0_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ; def s[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ; def s[12:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s8, s15 ; GFX90A-NEXT: s_mov_b32 s10, s4 ; GFX90A-NEXT: s_mov_b32 s11, s4 ; GFX90A-NEXT: ;;#ASMSTART @@ -13910,6 +14085,7 @@ define void @s_shuffle_v4f32_v4f32__7_u_0_0() { ; ; GFX942-LABEL: s_shuffle_v4f32_v4f32__7_u_0_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -14358,15 +14534,16 @@ define void @s_shuffle_v4f32_v4f32__7_7_0_0() { define void @s_shuffle_v4f32_v4f32__7_7_u_0() { ; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_u_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ; def s[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ; def s[12:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s11 -; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s8, s15 +; GFX900-NEXT: s_mov_b32 s9, s15 ; GFX900-NEXT: s_mov_b32 s11, s4 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] @@ -14375,15 +14552,16 @@ define void @s_shuffle_v4f32_v4f32__7_7_u_0() { ; ; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_u_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ; def s[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ; def s[12:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s11 -; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s8, s15 +; GFX90A-NEXT: s_mov_b32 s9, s15 ; GFX90A-NEXT: s_mov_b32 s11, s4 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] @@ -14392,6 +14570,7 @@ define void @s_shuffle_v4f32_v4f32__7_7_u_0() { ; ; GFX942-LABEL: s_shuffle_v4f32_v4f32__7_7_u_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -15070,14 +15249,15 @@ define void @s_shuffle_v4f32_v4f32__7_1_1_1() { define void @s_shuffle_v4f32_v4f32__7_u_1_1() { ; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_u_1_1: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ; def s[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ; def s[12:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s8, s15 ; GFX900-NEXT: s_mov_b32 s10, s5 ; GFX900-NEXT: s_mov_b32 s11, s5 ; GFX900-NEXT: ;;#ASMSTART @@ -15087,14 +15267,15 @@ define void @s_shuffle_v4f32_v4f32__7_u_1_1() { ; ; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_u_1_1: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ; def s[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ; def s[12:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s8, s15 ; GFX90A-NEXT: s_mov_b32 s10, s5 ; GFX90A-NEXT: s_mov_b32 s11, s5 ; GFX90A-NEXT: ;;#ASMSTART @@ -15104,6 +15285,7 @@ define void @s_shuffle_v4f32_v4f32__7_u_1_1() { ; ; GFX942-LABEL: s_shuffle_v4f32_v4f32__7_u_1_1: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -15552,15 +15734,16 @@ define void @s_shuffle_v4f32_v4f32__7_7_1_1() { define void @s_shuffle_v4f32_v4f32__7_7_u_1() { ; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_u_1: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ; def s[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ; def s[12:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s11 -; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s8, s15 +; GFX900-NEXT: s_mov_b32 s9, s15 ; GFX900-NEXT: s_mov_b32 s11, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] @@ -15569,15 +15752,16 @@ define void @s_shuffle_v4f32_v4f32__7_7_u_1() { ; ; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_u_1: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ; def s[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ; def s[12:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s11 -; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s8, s15 +; GFX90A-NEXT: s_mov_b32 s9, s15 ; GFX90A-NEXT: s_mov_b32 s11, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] @@ -15586,6 +15770,7 @@ define void @s_shuffle_v4f32_v4f32__7_7_u_1() { ; ; GFX942-LABEL: s_shuffle_v4f32_v4f32__7_7_u_1: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -16725,15 +16910,16 @@ define void @s_shuffle_v4f32_v4f32__7_7_2_2() { define void @s_shuffle_v4f32_v4f32__7_7_u_2() { ; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_u_2: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ; def s[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ; def s[12:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s11 -; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s8, s15 +; GFX900-NEXT: s_mov_b32 s9, s15 ; GFX900-NEXT: s_mov_b32 s11, s6 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] @@ -16742,15 +16928,16 @@ define void @s_shuffle_v4f32_v4f32__7_7_u_2() { ; ; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_u_2: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ; def s[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ; def s[12:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s11 -; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s8, s15 +; GFX90A-NEXT: s_mov_b32 s9, s15 ; GFX90A-NEXT: s_mov_b32 s11, s6 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] @@ -16759,6 +16946,7 @@ define void @s_shuffle_v4f32_v4f32__7_7_u_2() { ; ; GFX942-LABEL: s_shuffle_v4f32_v4f32__7_7_u_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -18358,6 +18546,7 @@ define void @s_shuffle_v4f32_v4f32__0_4_4_4() { define void @s_shuffle_v4f32_v4f32__1_4_4_4() { ; GFX900-LABEL: s_shuffle_v4f32_v4f32__1_4_4_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -18370,6 +18559,7 @@ define void @s_shuffle_v4f32_v4f32__1_4_4_4() { ; ; GFX90A-LABEL: s_shuffle_v4f32_v4f32__1_4_4_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -18382,6 +18572,7 @@ define void @s_shuffle_v4f32_v4f32__1_4_4_4() { ; ; GFX942-LABEL: s_shuffle_v4f32_v4f32__1_4_4_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -18400,6 +18591,7 @@ define void @s_shuffle_v4f32_v4f32__1_4_4_4() { define void @s_shuffle_v4f32_v4f32__2_4_4_4() { ; GFX900-LABEL: s_shuffle_v4f32_v4f32__2_4_4_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -18412,6 +18604,7 @@ define void @s_shuffle_v4f32_v4f32__2_4_4_4() { ; ; GFX90A-LABEL: s_shuffle_v4f32_v4f32__2_4_4_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -18424,6 +18617,7 @@ define void @s_shuffle_v4f32_v4f32__2_4_4_4() { ; ; GFX942-LABEL: s_shuffle_v4f32_v4f32__2_4_4_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -18442,6 +18636,7 @@ define void @s_shuffle_v4f32_v4f32__2_4_4_4() { define void @s_shuffle_v4f32_v4f32__3_4_4_4() { ; GFX900-LABEL: s_shuffle_v4f32_v4f32__3_4_4_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -18454,6 +18649,7 @@ define void @s_shuffle_v4f32_v4f32__3_4_4_4() { ; ; GFX90A-LABEL: s_shuffle_v4f32_v4f32__3_4_4_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -18466,6 +18662,7 @@ define void @s_shuffle_v4f32_v4f32__3_4_4_4() { ; ; GFX942-LABEL: s_shuffle_v4f32_v4f32__3_4_4_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -18654,6 +18851,7 @@ define void @s_shuffle_v4f32_v4f32__7_4_4_4() { define void @s_shuffle_v4f32_v4f32__7_u_4_4() { ; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_u_4_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -18668,6 +18866,7 @@ define void @s_shuffle_v4f32_v4f32__7_u_4_4() { ; ; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_u_4_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -18682,6 +18881,7 @@ define void @s_shuffle_v4f32_v4f32__7_u_4_4() { ; ; GFX942-LABEL: s_shuffle_v4f32_v4f32__7_u_4_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -19100,6 +19300,7 @@ define void @s_shuffle_v4f32_v4f32__7_7_4_4() { define void @s_shuffle_v4f32_v4f32__7_7_u_4() { ; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_u_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -19114,6 +19315,7 @@ define void @s_shuffle_v4f32_v4f32__7_7_u_4() { ; ; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_u_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -19128,6 +19330,7 @@ define void @s_shuffle_v4f32_v4f32__7_7_u_4() { ; ; GFX942-LABEL: s_shuffle_v4f32_v4f32__7_7_u_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -19829,6 +20032,7 @@ define void @s_shuffle_v4f32_v4f32__7_5_5_5() { define void @s_shuffle_v4f32_v4f32__7_u_5_5() { ; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_u_5_5: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -19843,6 +20047,7 @@ define void @s_shuffle_v4f32_v4f32__7_u_5_5() { ; ; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_u_5_5: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -19857,6 +20062,7 @@ define void @s_shuffle_v4f32_v4f32__7_u_5_5() { ; ; GFX942-LABEL: s_shuffle_v4f32_v4f32__7_u_5_5: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -20275,6 +20481,7 @@ define void @s_shuffle_v4f32_v4f32__7_7_5_5() { define void @s_shuffle_v4f32_v4f32__7_7_u_5() { ; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_u_5: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -20289,6 +20496,7 @@ define void @s_shuffle_v4f32_v4f32__7_7_u_5() { ; ; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_u_5: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -20303,6 +20511,7 @@ define void @s_shuffle_v4f32_v4f32__7_7_u_5() { ; ; GFX942-LABEL: s_shuffle_v4f32_v4f32__7_7_u_5: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -21349,6 +21558,7 @@ define void @s_shuffle_v4f32_v4f32__7_7_6_6() { define void @s_shuffle_v4f32_v4f32__7_7_u_6() { ; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_u_6: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -21363,6 +21573,7 @@ define void @s_shuffle_v4f32_v4f32__7_7_u_6() { ; ; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_u_6: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -21377,6 +21588,7 @@ define void @s_shuffle_v4f32_v4f32__7_7_u_6() { ; ; GFX942-LABEL: s_shuffle_v4f32_v4f32__7_7_u_6: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i16.v3i16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i16.v3i16.ll index 7b3a5a879f44f..8336b63c1088b 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i16.v3i16.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i16.v3i16.ll @@ -102,37 +102,41 @@ define void @v_shuffle_v4i16_v3i16__1_u_u_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4i16_v3i16__2_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i16_v3i16__2_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i16_v3i16__2_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i16_v3i16__2_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i16> asm "; def $0", "=v"() @@ -202,37 +206,41 @@ define void @v_shuffle_v4i16_v3i16__4_u_u_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4i16_v3i16__5_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i16_v3i16__5_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i16> asm "; def $0", "=v"() @@ -4458,37 +4466,41 @@ define void @v_shuffle_v4i16_v3i16__1_3_3_3(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4i16_v3i16__2_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i16_v3i16__2_3_3_3: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i16_v3i16__2_3_3_3: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i16_v3i16__2_3_3_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i16> asm "; def $0", "=v"() @@ -7390,6 +7402,7 @@ define void @s_shuffle_v4i16_v3i16__1_u_u_u() { define void @s_shuffle_v4i16_v3i16__2_u_u_u() { ; GFX900-LABEL: s_shuffle_v4i16_v3i16__2_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -7402,6 +7415,7 @@ define void @s_shuffle_v4i16_v3i16__2_u_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4i16_v3i16__2_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -7414,6 +7428,7 @@ define void @s_shuffle_v4i16_v3i16__2_u_u_u() { ; ; GFX942-LABEL: s_shuffle_v4i16_v3i16__2_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -7493,6 +7508,7 @@ define void @s_shuffle_v4i16_v3i16__4_u_u_u() { define void @s_shuffle_v4i16_v3i16__5_u_u_u() { ; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -7505,6 +7521,7 @@ define void @s_shuffle_v4i16_v3i16__5_u_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -7517,6 +7534,7 @@ define void @s_shuffle_v4i16_v3i16__5_u_u_u() { ; ; GFX942-LABEL: s_shuffle_v4i16_v3i16__5_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -11390,6 +11408,7 @@ define void @s_shuffle_v4i16_v3i16__1_3_3_3() { define void @s_shuffle_v4i16_v3i16__2_3_3_3() { ; GFX900-LABEL: s_shuffle_v4i16_v3i16__2_3_3_3: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -11402,6 +11421,7 @@ define void @s_shuffle_v4i16_v3i16__2_3_3_3() { ; ; GFX90A-LABEL: s_shuffle_v4i16_v3i16__2_3_3_3: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -11414,6 +11434,7 @@ define void @s_shuffle_v4i16_v3i16__2_3_3_3() { ; ; GFX942-LABEL: s_shuffle_v4i16_v3i16__2_3_3_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i16.v4i16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i16.v4i16.ll index 2a371b7c7d2d3..953603b990d4b 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i16.v4i16.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i16.v4i16.ll @@ -99,37 +99,41 @@ define void @v_shuffle_v4i16_v4i16__1_u_u_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4i16_v4i16__2_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i16_v4i16__2_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i16_v4i16__2_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i16_v4i16__2_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i16> asm "; def $0", "=v"() @@ -237,37 +241,41 @@ define void @v_shuffle_v4i16_v4i16__5_u_u_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4i16_v4i16__6_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i16_v4i16__6_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i16_v4i16__6_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i16_v4i16__6_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i16> asm "; def $0", "=v"() @@ -7087,37 +7095,41 @@ define void @v_shuffle_v4i16_v4i16__1_4_4_4(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4i16_v4i16__2_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i16_v4i16__2_4_4_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:1] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i16_v4i16__2_4_4_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i16_v4i16__2_4_4_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:1] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i16> asm "; def $0", "=v"() @@ -12198,6 +12210,7 @@ define void @s_shuffle_v4i16_v4i16__1_u_u_u() { define void @s_shuffle_v4i16_v4i16__2_u_u_u() { ; GFX900-LABEL: s_shuffle_v4i16_v4i16__2_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -12210,6 +12223,7 @@ define void @s_shuffle_v4i16_v4i16__2_u_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4i16_v4i16__2_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -12222,6 +12236,7 @@ define void @s_shuffle_v4i16_v4i16__2_u_u_u() { ; ; GFX942-LABEL: s_shuffle_v4i16_v4i16__2_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -12339,6 +12354,7 @@ define void @s_shuffle_v4i16_v4i16__5_u_u_u() { define void @s_shuffle_v4i16_v4i16__6_u_u_u() { ; GFX900-LABEL: s_shuffle_v4i16_v4i16__6_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -12351,6 +12367,7 @@ define void @s_shuffle_v4i16_v4i16__6_u_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4i16_v4i16__6_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -12363,6 +12380,7 @@ define void @s_shuffle_v4i16_v4i16__6_u_u_u() { ; ; GFX942-LABEL: s_shuffle_v4i16_v4i16__6_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -18740,6 +18758,7 @@ define void @s_shuffle_v4i16_v4i16__1_4_4_4() { define void @s_shuffle_v4i16_v4i16__2_4_4_4() { ; GFX900-LABEL: s_shuffle_v4i16_v4i16__2_4_4_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -18752,6 +18771,7 @@ define void @s_shuffle_v4i16_v4i16__2_4_4_4() { ; ; GFX90A-LABEL: s_shuffle_v4i16_v4i16__2_4_4_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -18764,6 +18784,7 @@ define void @s_shuffle_v4i16_v4i16__2_4_4_4() { ; ; GFX942-LABEL: s_shuffle_v4i16_v4i16__2_4_4_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v2i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v2i32.ll index 9d3affa6da266..a297452d7ca47 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v2i32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v2i32.ll @@ -59,35 +59,39 @@ define void @v_shuffle_v4i32_v2i32__1_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v2i32__1_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v2i32__1_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[4:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx4 v2, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -112,35 +116,39 @@ define void @v_shuffle_v4i32_v2i32__3_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v2i32__3_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v2i32__3_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[4:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx4 v2, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -271,28 +279,30 @@ define void @v_shuffle_v4i32_v2i32__3_2_u_u(ptr addrspace(1) inreg %ptr) { ; ; GFX90A-LABEL: v_shuffle_v4i32_v2i32__3_2_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v2i32__3_2_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[4:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -2068,35 +2078,39 @@ define void @v_shuffle_v4i32_v2i32__1_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v2i32__1_2_2_2: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v2i32__1_2_2_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[4:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx4 v2, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -2371,43 +2385,47 @@ define void @v_shuffle_v4i32_v2i32__3_3_2_2(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4i32_v2i32__3_3_u_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v2i32__3_3_u_2: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[4:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v2i32__3_3_u_2: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v2i32__3_3_u_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[4:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -3201,6 +3219,7 @@ define void @s_shuffle_v4i32_v2i32__0_u_u_u() { define void @s_shuffle_v4i32_v2i32__1_u_u_u() { ; GFX900-LABEL: s_shuffle_v4i32_v2i32__1_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -3213,6 +3232,7 @@ define void @s_shuffle_v4i32_v2i32__1_u_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4i32_v2i32__1_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -3225,6 +3245,7 @@ define void @s_shuffle_v4i32_v2i32__1_u_u_u() { ; ; GFX942-LABEL: s_shuffle_v4i32_v2i32__1_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -3257,6 +3278,7 @@ define void @s_shuffle_v4i32_v2i32__2_u_u_u() { define void @s_shuffle_v4i32_v2i32__3_u_u_u() { ; GFX900-LABEL: s_shuffle_v4i32_v2i32__3_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -3269,6 +3291,7 @@ define void @s_shuffle_v4i32_v2i32__3_u_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4i32_v2i32__3_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -3281,6 +3304,7 @@ define void @s_shuffle_v4i32_v2i32__3_u_u_u() { ; ; GFX942-LABEL: s_shuffle_v4i32_v2i32__3_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -3300,6 +3324,7 @@ define void @s_shuffle_v4i32_v2i32__3_u_u_u() { define void @s_shuffle_v4i32_v2i32__3_0_u_u() { ; GFX900-LABEL: s_shuffle_v4i32_v2i32__3_0_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -3316,6 +3341,7 @@ define void @s_shuffle_v4i32_v2i32__3_0_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4i32_v2i32__3_0_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -3332,6 +3358,7 @@ define void @s_shuffle_v4i32_v2i32__3_0_u_u() { ; ; GFX942-LABEL: s_shuffle_v4i32_v2i32__3_0_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -3407,6 +3434,7 @@ define void @s_shuffle_v4i32_v2i32__3_1_u_u() { define void @s_shuffle_v4i32_v2i32__3_2_u_u() { ; GFX900-LABEL: s_shuffle_v4i32_v2i32__3_2_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -3420,6 +3448,7 @@ define void @s_shuffle_v4i32_v2i32__3_2_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4i32_v2i32__3_2_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -3433,6 +3462,7 @@ define void @s_shuffle_v4i32_v2i32__3_2_u_u() { ; ; GFX942-LABEL: s_shuffle_v4i32_v2i32__3_2_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -4731,6 +4761,7 @@ define void @s_shuffle_v4i32_v2i32__0_2_2_2() { define void @s_shuffle_v4i32_v2i32__1_2_2_2() { ; GFX900-LABEL: s_shuffle_v4i32_v2i32__1_2_2_2: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -4743,6 +4774,7 @@ define void @s_shuffle_v4i32_v2i32__1_2_2_2() { ; ; GFX90A-LABEL: s_shuffle_v4i32_v2i32__1_2_2_2: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -4755,6 +4787,7 @@ define void @s_shuffle_v4i32_v2i32__1_2_2_2() { ; ; GFX942-LABEL: s_shuffle_v4i32_v2i32__1_2_2_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -4930,6 +4963,7 @@ define void @s_shuffle_v4i32_v2i32__3_3_2_2() { define void @s_shuffle_v4i32_v2i32__3_3_u_2() { ; GFX900-LABEL: s_shuffle_v4i32_v2i32__3_3_u_2: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -4944,6 +4978,7 @@ define void @s_shuffle_v4i32_v2i32__3_3_u_2() { ; ; GFX90A-LABEL: s_shuffle_v4i32_v2i32__3_3_u_2: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -4958,6 +4993,7 @@ define void @s_shuffle_v4i32_v2i32__3_3_u_2() { ; ; GFX942-LABEL: s_shuffle_v4i32_v2i32__3_3_u_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v3i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v3i32.ll index 1a669adf2b635..8ee15c1a9c3f0 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v3i32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v3i32.ll @@ -59,35 +59,39 @@ define void @v_shuffle_v4i32_v3i32__1_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__1_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__1_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -101,11 +105,12 @@ define void @v_shuffle_v4i32_v3i32__2_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -113,11 +118,12 @@ define void @v_shuffle_v4i32_v3i32__2_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -125,11 +131,12 @@ define void @v_shuffle_v4i32_v3i32__2_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -154,35 +161,39 @@ define void @v_shuffle_v4i32_v3i32__4_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__4_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__4_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -197,11 +208,12 @@ define void @v_shuffle_v4i32_v3i32__5_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -209,11 +221,12 @@ define void @v_shuffle_v4i32_v3i32__5_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -221,11 +234,12 @@ define void @v_shuffle_v4i32_v3i32__5_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -255,15 +269,16 @@ define void @v_shuffle_v4i32_v3i32__5_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX90A-NEXT: v_mov_b32_e32 v9, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[6:8] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx4 v9, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -271,15 +286,16 @@ define void @v_shuffle_v4i32_v3i32__5_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX942-NEXT: v_mov_b32_e32 v9, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[6:8] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: v_mov_b32_e32 v0, v8 +; GFX942-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -347,15 +363,16 @@ define void @v_shuffle_v4i32_v3i32__5_2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[5:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -363,15 +380,16 @@ define void @v_shuffle_v4i32_v3i32__5_2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[6:8] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -379,15 +397,16 @@ define void @v_shuffle_v4i32_v3i32__5_2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[6:8] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: v_mov_b32_e32 v0, v8 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -412,28 +431,30 @@ define void @v_shuffle_v4i32_v3i32__5_3_u_u(ptr addrspace(1) inreg %ptr) { ; ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_3_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_3_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -491,12 +512,13 @@ define void @v_shuffle_v4i32_v3i32__5_5_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -504,12 +526,13 @@ define void @v_shuffle_v4i32_v3i32__5_5_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -517,12 +540,13 @@ define void @v_shuffle_v4i32_v3i32__5_5_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -537,15 +561,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[5:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -553,15 +579,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX90A-NEXT: v_mov_b32_e32 v9, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[6:8] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v8 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: global_store_dwordx4 v9, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -569,16 +597,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_0_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX942-NEXT: v_mov_b32_e32 v9, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[6:8] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v0, v8 +; GFX942-NEXT: v_mov_b32_e32 v1, v8 +; GFX942-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -609,16 +638,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -626,17 +656,18 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: v_mov_b32_e32 v2, v5 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -705,41 +736,47 @@ define void @v_shuffle_v4i32_v3i32__5_5_2_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4i32_v3i32__5_5_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_5_3_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_3_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_3_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -765,29 +802,32 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_u(ptr addrspace(1) inreg %ptr) { ; ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_4_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_4_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: v_mov_b32_e32 v2, v5 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1179,29 +1219,32 @@ define void @v_shuffle_v4i32_v3i32__u_0_0_0(ptr addrspace(1) inreg %ptr) { ; ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__u_0_0_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__u_0_0_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1374,29 +1417,32 @@ define void @v_shuffle_v4i32_v3i32__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__3_0_0_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__3_0_0_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1529,16 +1575,17 @@ define void @v_shuffle_v4i32_v3i32__5_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: ; def v[5:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1546,16 +1593,17 @@ define void @v_shuffle_v4i32_v3i32__5_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX90A-NEXT: v_mov_b32_e32 v9, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[6:8] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v9, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1563,16 +1611,17 @@ define void @v_shuffle_v4i32_v3i32__5_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX942-NEXT: v_mov_b32_e32 v9, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[6:8] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v0, v8 ; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1884,16 +1933,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[5:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1901,16 +1951,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX90A-NEXT: v_mov_b32_e32 v9, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[6:8] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v9, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1918,16 +1969,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_u_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX942-NEXT: v_mov_b32_e32 v9, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[6:8] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v0, v8 +; GFX942-NEXT: v_mov_b32_e32 v1, v8 +; GFX942-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -2545,16 +2597,17 @@ define void @v_shuffle_v4i32_v3i32__5_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2562,17 +2615,18 @@ define void @v_shuffle_v4i32_v3i32__5_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v5 +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -2876,16 +2930,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[5:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2893,15 +2948,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2909,16 +2966,18 @@ define void @v_shuffle_v4i32_v3i32__5_5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -3268,29 +3327,31 @@ define void @v_shuffle_v4i32_v3i32__1_2_2_2(ptr addrspace(1) inreg %ptr) { ; ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__1_2_2_2: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__1_2_2_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[4:5] op_sel:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -3416,8 +3477,9 @@ define void @v_shuffle_v4i32_v3i32__4_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND @@ -3433,6 +3495,7 @@ define void @v_shuffle_v4i32_v3i32__4_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: ;;#ASMSTART @@ -3871,16 +3934,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_u_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[6:8] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3888,16 +3952,17 @@ define void @v_shuffle_v4i32_v3i32__5_5_u_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[6:8] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v0, v8 +; GFX942-NEXT: v_mov_b32_e32 v1, v8 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -3986,14 +4051,15 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] @@ -4004,14 +4070,15 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[4:5] op_sel:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v6 ; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] @@ -4108,8 +4175,9 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND @@ -4126,6 +4194,7 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: ;;#ASMSTART @@ -4200,35 +4269,39 @@ define void @v_shuffle_v4i32_v3i32__1_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__1_3_3_3: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__1_3_3_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -4242,11 +4315,12 @@ define void @v_shuffle_v4i32_v3i32__2_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4254,11 +4328,12 @@ define void @v_shuffle_v4i32_v3i32__2_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4266,11 +4341,12 @@ define void @v_shuffle_v4i32_v3i32__2_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -4395,36 +4471,39 @@ define void @v_shuffle_v4i32_v3i32__5_3_3_3(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4i32_v3i32__5_u_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_u_3_3: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_u_3_3: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v1, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_u_3_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v1, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND @@ -4432,7 +4511,7 @@ define void @v_shuffle_v4i32_v3i32__5_u_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -4726,43 +4805,47 @@ define void @v_shuffle_v4i32_v3i32__5_5_3_3(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4i32_v3i32__5_5_u_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_5_u_3: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_u_3: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_u_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -5375,29 +5458,32 @@ define void @v_shuffle_v4i32_v3i32__5_u_4_4(ptr addrspace(1) inreg %ptr) { ; ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_u_4_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_u_4_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v2, v5 +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -5684,40 +5770,45 @@ define void @v_shuffle_v4i32_v3i32__5_5_u_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_u_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_u_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -6083,8 +6174,9 @@ define void @v_shuffle_v4i32_v3i32__1_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] @@ -6101,6 +6193,7 @@ define void @v_shuffle_v4i32_v3i32__1_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] @@ -6237,29 +6330,31 @@ define void @v_shuffle_v4i32_v3i32__4_5_5_5(ptr addrspace(1) inreg %ptr) { ; ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__4_5_5_5: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__4_5_5_5: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[4:5] op_sel:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -6605,13 +6700,14 @@ define void @v_shuffle_v4i32_v3i32__5_5_u_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6619,13 +6715,14 @@ define void @v_shuffle_v4i32_v3i32__5_5_u_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -6721,9 +6818,11 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -6739,9 +6838,11 @@ define void @v_shuffle_v4i32_v3i32__5_5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v1, v4 ; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -6877,6 +6978,7 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_5(ptr addrspace(1) inreg %ptr) { ; ; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_4_5: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] @@ -6891,6 +6993,7 @@ define void @v_shuffle_v4i32_v3i32__5_5_4_5(ptr addrspace(1) inreg %ptr) { ; ; GFX942-LABEL: v_shuffle_v4i32_v3i32__5_5_4_5: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] @@ -6966,6 +7069,7 @@ define void @s_shuffle_v4i32_v3i32__0_u_u_u() { define void @s_shuffle_v4i32_v3i32__1_u_u_u() { ; GFX900-LABEL: s_shuffle_v4i32_v3i32__1_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -6978,6 +7082,7 @@ define void @s_shuffle_v4i32_v3i32__1_u_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4i32_v3i32__1_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -6990,6 +7095,7 @@ define void @s_shuffle_v4i32_v3i32__1_u_u_u() { ; ; GFX942-LABEL: s_shuffle_v4i32_v3i32__1_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -7008,6 +7114,7 @@ define void @s_shuffle_v4i32_v3i32__1_u_u_u() { define void @s_shuffle_v4i32_v3i32__2_u_u_u() { ; GFX900-LABEL: s_shuffle_v4i32_v3i32__2_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -7020,6 +7127,7 @@ define void @s_shuffle_v4i32_v3i32__2_u_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4i32_v3i32__2_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -7032,6 +7140,7 @@ define void @s_shuffle_v4i32_v3i32__2_u_u_u() { ; ; GFX942-LABEL: s_shuffle_v4i32_v3i32__2_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -7064,6 +7173,7 @@ define void @s_shuffle_v4i32_v3i32__3_u_u_u() { define void @s_shuffle_v4i32_v3i32__4_u_u_u() { ; GFX900-LABEL: s_shuffle_v4i32_v3i32__4_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -7076,6 +7186,7 @@ define void @s_shuffle_v4i32_v3i32__4_u_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4i32_v3i32__4_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -7088,6 +7199,7 @@ define void @s_shuffle_v4i32_v3i32__4_u_u_u() { ; ; GFX942-LABEL: s_shuffle_v4i32_v3i32__4_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -7107,6 +7219,7 @@ define void @s_shuffle_v4i32_v3i32__4_u_u_u() { define void @s_shuffle_v4i32_v3i32__5_u_u_u() { ; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -7119,6 +7232,7 @@ define void @s_shuffle_v4i32_v3i32__5_u_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -7131,6 +7245,7 @@ define void @s_shuffle_v4i32_v3i32__5_u_u_u() { ; ; GFX942-LABEL: s_shuffle_v4i32_v3i32__5_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -7150,14 +7265,15 @@ define void @s_shuffle_v4i32_v3i32__5_u_u_u() { define void @s_shuffle_v4i32_v3i32__5_0_u_u() { ; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_0_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ; def s[12:14] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s8, s14 ; GFX900-NEXT: s_mov_b32 s9, s4 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] @@ -7166,14 +7282,15 @@ define void @s_shuffle_v4i32_v3i32__5_0_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_0_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ; def s[12:14] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s8, s14 ; GFX90A-NEXT: s_mov_b32 s9, s4 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] @@ -7182,6 +7299,7 @@ define void @s_shuffle_v4i32_v3i32__5_0_u_u() { ; ; GFX942-LABEL: s_shuffle_v4i32_v3i32__5_0_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -7257,14 +7375,15 @@ define void @s_shuffle_v4i32_v3i32__5_1_u_u() { define void @s_shuffle_v4i32_v3i32__5_2_u_u() { ; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_2_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ; def s[12:14] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s8, s14 ; GFX900-NEXT: s_mov_b32 s9, s6 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] @@ -7273,14 +7392,15 @@ define void @s_shuffle_v4i32_v3i32__5_2_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_2_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ; def s[12:14] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s8, s14 ; GFX90A-NEXT: s_mov_b32 s9, s6 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] @@ -7289,6 +7409,7 @@ define void @s_shuffle_v4i32_v3i32__5_2_u_u() { ; ; GFX942-LABEL: s_shuffle_v4i32_v3i32__5_2_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -7312,6 +7433,7 @@ define void @s_shuffle_v4i32_v3i32__5_2_u_u() { define void @s_shuffle_v4i32_v3i32__5_3_u_u() { ; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_3_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -7325,6 +7447,7 @@ define void @s_shuffle_v4i32_v3i32__5_3_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_3_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -7338,6 +7461,7 @@ define void @s_shuffle_v4i32_v3i32__5_3_u_u() { ; ; GFX942-LABEL: s_shuffle_v4i32_v3i32__5_3_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -7377,6 +7501,7 @@ define void @s_shuffle_v4i32_v3i32__5_4_u_u() { define void @s_shuffle_v4i32_v3i32__5_5_u_u() { ; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_5_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -7390,6 +7515,7 @@ define void @s_shuffle_v4i32_v3i32__5_5_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_5_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -7403,6 +7529,7 @@ define void @s_shuffle_v4i32_v3i32__5_5_u_u() { ; ; GFX942-LABEL: s_shuffle_v4i32_v3i32__5_5_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -7423,15 +7550,16 @@ define void @s_shuffle_v4i32_v3i32__5_5_u_u() { define void @s_shuffle_v4i32_v3i32__5_5_0_u() { ; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_5_0_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ; def s[12:14] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s14 ; GFX900-NEXT: s_mov_b32 s10, s4 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] @@ -7440,15 +7568,16 @@ define void @s_shuffle_v4i32_v3i32__5_5_0_u() { ; ; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_5_0_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ; def s[12:14] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s14 ; GFX90A-NEXT: s_mov_b32 s10, s4 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] @@ -7457,6 +7586,7 @@ define void @s_shuffle_v4i32_v3i32__5_5_0_u() { ; ; GFX942-LABEL: s_shuffle_v4i32_v3i32__5_5_0_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -7481,15 +7611,16 @@ define void @s_shuffle_v4i32_v3i32__5_5_0_u() { define void @s_shuffle_v4i32_v3i32__5_5_1_u() { ; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_5_1_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ; def s[12:14] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s14 ; GFX900-NEXT: s_mov_b32 s10, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] @@ -7498,15 +7629,16 @@ define void @s_shuffle_v4i32_v3i32__5_5_1_u() { ; ; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_5_1_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ; def s[12:14] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s14 ; GFX90A-NEXT: s_mov_b32 s10, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] @@ -7515,6 +7647,7 @@ define void @s_shuffle_v4i32_v3i32__5_5_1_u() { ; ; GFX942-LABEL: s_shuffle_v4i32_v3i32__5_5_1_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -7594,6 +7727,7 @@ define void @s_shuffle_v4i32_v3i32__5_5_2_u() { define void @s_shuffle_v4i32_v3i32__5_5_3_u() { ; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_5_3_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -7608,6 +7742,7 @@ define void @s_shuffle_v4i32_v3i32__5_5_3_u() { ; ; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_5_3_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -7622,6 +7757,7 @@ define void @s_shuffle_v4i32_v3i32__5_5_3_u() { ; ; GFX942-LABEL: s_shuffle_v4i32_v3i32__5_5_3_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -7643,6 +7779,7 @@ define void @s_shuffle_v4i32_v3i32__5_5_3_u() { define void @s_shuffle_v4i32_v3i32__5_5_4_u() { ; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_5_4_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -7657,6 +7794,7 @@ define void @s_shuffle_v4i32_v3i32__5_5_4_u() { ; ; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_5_4_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -7671,6 +7809,7 @@ define void @s_shuffle_v4i32_v3i32__5_5_4_u() { ; ; GFX942-LABEL: s_shuffle_v4i32_v3i32__5_5_4_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -8011,6 +8150,7 @@ define void @s_shuffle_v4i32_v3i32__5_5_5_5() { define void @s_shuffle_v4i32_v3i32__u_0_0_0() { ; GFX900-LABEL: s_shuffle_v4i32_v3i32__u_0_0_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -8025,6 +8165,7 @@ define void @s_shuffle_v4i32_v3i32__u_0_0_0() { ; ; GFX90A-LABEL: s_shuffle_v4i32_v3i32__u_0_0_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -8039,6 +8180,7 @@ define void @s_shuffle_v4i32_v3i32__u_0_0_0() { ; ; GFX942-LABEL: s_shuffle_v4i32_v3i32__u_0_0_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -8181,6 +8323,7 @@ define void @s_shuffle_v4i32_v3i32__2_0_0_0() { define void @s_shuffle_v4i32_v3i32__3_0_0_0() { ; GFX900-LABEL: s_shuffle_v4i32_v3i32__3_0_0_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -8195,6 +8338,7 @@ define void @s_shuffle_v4i32_v3i32__3_0_0_0() { ; ; GFX90A-LABEL: s_shuffle_v4i32_v3i32__3_0_0_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -8209,6 +8353,7 @@ define void @s_shuffle_v4i32_v3i32__3_0_0_0() { ; ; GFX942-LABEL: s_shuffle_v4i32_v3i32__3_0_0_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -8351,14 +8496,15 @@ define void @s_shuffle_v4i32_v3i32__5_0_0_0() { define void @s_shuffle_v4i32_v3i32__5_u_0_0() { ; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_u_0_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ; def s[12:14] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s8, s14 ; GFX900-NEXT: s_mov_b32 s10, s4 ; GFX900-NEXT: s_mov_b32 s11, s4 ; GFX900-NEXT: ;;#ASMSTART @@ -8368,14 +8514,15 @@ define void @s_shuffle_v4i32_v3i32__5_u_0_0() { ; ; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_u_0_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ; def s[12:14] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s8, s14 ; GFX90A-NEXT: s_mov_b32 s10, s4 ; GFX90A-NEXT: s_mov_b32 s11, s4 ; GFX90A-NEXT: ;;#ASMSTART @@ -8385,6 +8532,7 @@ define void @s_shuffle_v4i32_v3i32__5_u_0_0() { ; ; GFX942-LABEL: s_shuffle_v4i32_v3i32__5_u_0_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -8711,15 +8859,16 @@ define void @s_shuffle_v4i32_v3i32__5_5_0_0() { define void @s_shuffle_v4i32_v3i32__5_5_u_0() { ; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_5_u_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ; def s[12:14] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s14 ; GFX900-NEXT: s_mov_b32 s11, s4 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] @@ -8728,15 +8877,16 @@ define void @s_shuffle_v4i32_v3i32__5_5_u_0() { ; ; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_5_u_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ; def s[12:14] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s14 ; GFX90A-NEXT: s_mov_b32 s11, s4 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] @@ -8745,6 +8895,7 @@ define void @s_shuffle_v4i32_v3i32__5_5_u_0() { ; ; GFX942-LABEL: s_shuffle_v4i32_v3i32__5_5_u_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -9226,14 +9377,15 @@ define void @s_shuffle_v4i32_v3i32__5_1_1_1() { define void @s_shuffle_v4i32_v3i32__5_u_1_1() { ; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_u_1_1: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ; def s[12:14] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s8, s14 ; GFX900-NEXT: s_mov_b32 s10, s5 ; GFX900-NEXT: s_mov_b32 s11, s5 ; GFX900-NEXT: ;;#ASMSTART @@ -9243,14 +9395,15 @@ define void @s_shuffle_v4i32_v3i32__5_u_1_1() { ; ; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_u_1_1: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ; def s[12:14] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s8, s14 ; GFX90A-NEXT: s_mov_b32 s10, s5 ; GFX90A-NEXT: s_mov_b32 s11, s5 ; GFX90A-NEXT: ;;#ASMSTART @@ -9260,6 +9413,7 @@ define void @s_shuffle_v4i32_v3i32__5_u_1_1() { ; ; GFX942-LABEL: s_shuffle_v4i32_v3i32__5_u_1_1: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -9586,15 +9740,16 @@ define void @s_shuffle_v4i32_v3i32__5_5_1_1() { define void @s_shuffle_v4i32_v3i32__5_5_u_1() { ; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_5_u_1: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ; def s[12:14] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s14 ; GFX900-NEXT: s_mov_b32 s11, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] @@ -9603,15 +9758,16 @@ define void @s_shuffle_v4i32_v3i32__5_5_u_1() { ; ; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_5_u_1: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ; def s[12:14] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s14 ; GFX90A-NEXT: s_mov_b32 s11, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] @@ -9620,6 +9776,7 @@ define void @s_shuffle_v4i32_v3i32__5_5_u_1() { ; ; GFX942-LABEL: s_shuffle_v4i32_v3i32__5_5_u_1: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -10446,15 +10603,16 @@ define void @s_shuffle_v4i32_v3i32__5_5_2_2() { define void @s_shuffle_v4i32_v3i32__5_5_u_2() { ; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_5_u_2: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ; def s[12:14] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s14 ; GFX900-NEXT: s_mov_b32 s11, s6 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] @@ -10463,15 +10621,16 @@ define void @s_shuffle_v4i32_v3i32__5_5_u_2() { ; ; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_5_u_2: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ; def s[12:14] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s14 ; GFX90A-NEXT: s_mov_b32 s11, s6 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] @@ -10480,6 +10639,7 @@ define void @s_shuffle_v4i32_v3i32__5_5_u_2() { ; ; GFX942-LABEL: s_shuffle_v4i32_v3i32__5_5_u_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -10802,6 +10962,7 @@ define void @s_shuffle_v4i32_v3i32__0_3_3_3() { define void @s_shuffle_v4i32_v3i32__1_3_3_3() { ; GFX900-LABEL: s_shuffle_v4i32_v3i32__1_3_3_3: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -10814,6 +10975,7 @@ define void @s_shuffle_v4i32_v3i32__1_3_3_3() { ; ; GFX90A-LABEL: s_shuffle_v4i32_v3i32__1_3_3_3: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -10826,6 +10988,7 @@ define void @s_shuffle_v4i32_v3i32__1_3_3_3() { ; ; GFX942-LABEL: s_shuffle_v4i32_v3i32__1_3_3_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -10844,6 +11007,7 @@ define void @s_shuffle_v4i32_v3i32__1_3_3_3() { define void @s_shuffle_v4i32_v3i32__2_3_3_3() { ; GFX900-LABEL: s_shuffle_v4i32_v3i32__2_3_3_3: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -10856,6 +11020,7 @@ define void @s_shuffle_v4i32_v3i32__2_3_3_3() { ; ; GFX90A-LABEL: s_shuffle_v4i32_v3i32__2_3_3_3: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -10868,6 +11033,7 @@ define void @s_shuffle_v4i32_v3i32__2_3_3_3() { ; ; GFX942-LABEL: s_shuffle_v4i32_v3i32__2_3_3_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -11004,6 +11170,7 @@ define void @s_shuffle_v4i32_v3i32__5_3_3_3() { define void @s_shuffle_v4i32_v3i32__5_u_3_3() { ; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_u_3_3: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -11018,6 +11185,7 @@ define void @s_shuffle_v4i32_v3i32__5_u_3_3() { ; ; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_u_3_3: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -11032,6 +11200,7 @@ define void @s_shuffle_v4i32_v3i32__5_u_3_3() { ; ; GFX942-LABEL: s_shuffle_v4i32_v3i32__5_u_3_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -11337,6 +11506,7 @@ define void @s_shuffle_v4i32_v3i32__5_5_3_3() { define void @s_shuffle_v4i32_v3i32__5_5_u_3() { ; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_5_u_3: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -11351,6 +11521,7 @@ define void @s_shuffle_v4i32_v3i32__5_5_u_3() { ; ; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_5_u_3: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -11365,6 +11536,7 @@ define void @s_shuffle_v4i32_v3i32__5_5_u_3() { ; ; GFX942-LABEL: s_shuffle_v4i32_v3i32__5_5_u_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -11874,6 +12046,7 @@ define void @s_shuffle_v4i32_v3i32__5_4_4_4() { define void @s_shuffle_v4i32_v3i32__5_u_4_4() { ; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_u_4_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -11888,6 +12061,7 @@ define void @s_shuffle_v4i32_v3i32__5_u_4_4() { ; ; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_u_4_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -11902,6 +12076,7 @@ define void @s_shuffle_v4i32_v3i32__5_u_4_4() { ; ; GFX942-LABEL: s_shuffle_v4i32_v3i32__5_u_4_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -12207,6 +12382,7 @@ define void @s_shuffle_v4i32_v3i32__5_5_4_4() { define void @s_shuffle_v4i32_v3i32__5_5_u_4() { ; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_5_u_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -12221,6 +12397,7 @@ define void @s_shuffle_v4i32_v3i32__5_5_u_4() { ; ; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_5_u_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -12235,6 +12412,7 @@ define void @s_shuffle_v4i32_v3i32__5_5_u_4() { ; ; GFX942-LABEL: s_shuffle_v4i32_v3i32__5_5_u_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -12989,6 +13167,7 @@ define void @s_shuffle_v4i32_v3i32__5_4_5_5() { define void @s_shuffle_v4i32_v3i32__5_5_u_5() { ; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_5_u_5: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -13003,6 +13182,7 @@ define void @s_shuffle_v4i32_v3i32__5_5_u_5() { ; ; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_5_u_5: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -13017,6 +13197,7 @@ define void @s_shuffle_v4i32_v3i32__5_5_u_5() { ; ; GFX942-LABEL: s_shuffle_v4i32_v3i32__5_5_u_5: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v4i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v4i32.ll index 983afa566e2c1..3010c1c411a32 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v4i32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v4i32.ll @@ -61,9 +61,10 @@ define void @v_shuffle_v4i32_v4i32__1_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -73,9 +74,10 @@ define void @v_shuffle_v4i32_v4i32__1_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -85,9 +87,10 @@ define void @v_shuffle_v4i32_v4i32__1_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -103,33 +106,37 @@ define void @v_shuffle_v4i32_v4i32__2_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__2_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__2_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -141,37 +148,41 @@ define void @v_shuffle_v4i32_v4i32__2_u_u_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4i32_v4i32__3_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__3_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__3_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__3_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -198,9 +209,10 @@ define void @v_shuffle_v4i32_v4i32__5_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -210,9 +222,10 @@ define void @v_shuffle_v4i32_v4i32__5_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -222,9 +235,10 @@ define void @v_shuffle_v4i32_v4i32__5_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -241,33 +255,37 @@ define void @v_shuffle_v4i32_v4i32__6_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__6_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__6_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -280,37 +298,41 @@ define void @v_shuffle_v4i32_v4i32__6_u_u_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4i32_v4i32__7_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -327,13 +349,14 @@ define void @v_shuffle_v4i32_v4i32__7_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr5_vgpr6_vgpr7_vgpr8 +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx4 v9, v[5:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -434,13 +457,14 @@ define void @v_shuffle_v4i32_v4i32__7_2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ; implicit-def: $vgpr7_vgpr8_vgpr9_vgpr10 +; GFX900-NEXT: v_mov_b32_e32 v11, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v7, v6 +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: global_store_dwordx4 v11, v[7:10], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -484,49 +508,53 @@ define void @v_shuffle_v4i32_v4i32__7_2_u_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4i32_v4i32__7_3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_3_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr8_vgpr9_vgpr10_vgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v7 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_3_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr8_vgpr9_vgpr10_vgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v7 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_3_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr8_vgpr9_vgpr10_vgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v7 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -539,14 +567,15 @@ define void @v_shuffle_v4i32_v4i32__7_3_u_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4i32_v4i32__7_4_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_4_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx4 v4, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -626,14 +655,15 @@ define void @v_shuffle_v4i32_v4i32__7_5_u_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4i32_v4i32__7_6_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_6_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -670,40 +700,44 @@ define void @v_shuffle_v4i32_v4i32__7_6_u_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4i32_v4i32__7_7_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -720,14 +754,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr5_vgpr6_vgpr7_vgpr8 +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v0 +; GFX900-NEXT: global_store_dwordx4 v9, v[5:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -737,14 +772,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_0_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v5 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -754,15 +790,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_0_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v5 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -779,14 +815,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_1_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: v_mov_b32_e32 v8, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -796,14 +833,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_1_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v5 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -813,15 +851,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_1_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v5 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -890,52 +928,56 @@ define void @v_shuffle_v4i32_v4i32__7_7_2_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4i32_v4i32__7_7_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_3_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr8_vgpr9_vgpr10_vgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v7 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: v_mov_b32_e32 v10, v3 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_3_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr8_vgpr9_vgpr10_vgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v7 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: v_mov_b32_e32 v10, v3 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_3_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr8_vgpr9_vgpr10_vgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v7 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: v_mov_b32_e32 v10, v3 +; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -948,41 +990,47 @@ define void @v_shuffle_v4i32_v4i32__7_7_3_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4i32_v4i32__7_7_4_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_4_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx4 v4, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_4_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_4_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -995,40 +1043,47 @@ define void @v_shuffle_v4i32_v4i32__7_7_4_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4i32_v4i32__7_7_5_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_5_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_5_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_5_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -1087,43 +1142,47 @@ define void @v_shuffle_v4i32_v4i32__7_7_6_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4i32_v4i32__7_7_7_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_7_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_7_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_7_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -1578,11 +1637,12 @@ define void @v_shuffle_v4i32_v4i32__u_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1592,11 +1652,12 @@ define void @v_shuffle_v4i32_v4i32__u_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1605,12 +1666,13 @@ define void @v_shuffle_v4i32_v4i32__u_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -1820,11 +1882,12 @@ define void @v_shuffle_v4i32_v4i32__4_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1834,11 +1897,12 @@ define void @v_shuffle_v4i32_v4i32__4_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1848,11 +1912,12 @@ define void @v_shuffle_v4i32_v4i32__4_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -2050,14 +2115,15 @@ define void @v_shuffle_v4i32_v4i32__7_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr5_vgpr6_vgpr7_vgpr8 +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v0 +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: global_store_dwordx4 v9, v[5:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2067,14 +2133,15 @@ define void @v_shuffle_v4i32_v4i32__7_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v5 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2084,15 +2151,15 @@ define void @v_shuffle_v4i32_v4i32__7_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -2529,14 +2596,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr5_vgpr6_vgpr7_vgpr8 +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: global_store_dwordx4 v9, v[5:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2546,14 +2614,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v5 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2563,15 +2632,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_u_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v5 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -3400,14 +3469,15 @@ define void @v_shuffle_v4i32_v4i32__7_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: v_mov_b32_e32 v8, v1 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3417,14 +3487,15 @@ define void @v_shuffle_v4i32_v4i32__7_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v5 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3434,15 +3505,15 @@ define void @v_shuffle_v4i32_v4i32__7_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v5 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -3877,14 +3948,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3894,14 +3966,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_u_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v5 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3911,15 +3984,15 @@ define void @v_shuffle_v4i32_v4i32__7_7_u_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v5 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -5196,48 +5269,52 @@ define void @v_shuffle_v4i32_v4i32__7_7_u_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr7_vgpr8_vgpr9_vgpr10 +; GFX900-NEXT: v_mov_b32_e32 v11, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v7, v6 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v10, v2 +; GFX900-NEXT: global_store_dwordx4 v11, v[7:10], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_u_2: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr8_vgpr9_vgpr10_vgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v7 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: v_mov_b32_e32 v11, v2 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_u_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr8_vgpr9_vgpr10_vgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v7 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: v_mov_b32_e32 v11, v2 +; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -6974,9 +7051,10 @@ define void @v_shuffle_v4i32_v4i32__1_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6986,9 +7064,10 @@ define void @v_shuffle_v4i32_v4i32__1_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6998,9 +7077,10 @@ define void @v_shuffle_v4i32_v4i32__1_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -7016,33 +7096,37 @@ define void @v_shuffle_v4i32_v4i32__2_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__2_4_4_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__2_4_4_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -7054,37 +7138,41 @@ define void @v_shuffle_v4i32_v4i32__2_4_4_4(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4i32_v4i32__3_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__3_4_4_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__3_4_4_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__3_4_4_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -7256,43 +7344,47 @@ define void @v_shuffle_v4i32_v4i32__7_4_4_4(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4i32_v4i32__7_u_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_u_4_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v0 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_u_4_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_u_4_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -7694,41 +7786,47 @@ define void @v_shuffle_v4i32_v4i32__7_7_4_4(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4i32_v4i32__7_7_u_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_u_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v7, v0 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_u_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_u_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -8548,43 +8646,47 @@ define void @v_shuffle_v4i32_v4i32__7_5_5_5(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4i32_v4i32__7_u_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_u_5_5: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_u_5_5: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_u_5_5: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -8980,40 +9082,47 @@ define void @v_shuffle_v4i32_v4i32__7_7_5_5(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4i32_v4i32__7_7_u_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_u_5: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_u_5: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_u_5: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -10248,43 +10357,47 @@ define void @v_shuffle_v4i32_v4i32__7_7_6_6(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4i32_v4i32__7_7_u_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_u_6: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v7, v2 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_u_6: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i32_v4i32__7_7_u_6: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -11967,6 +12080,7 @@ define void @s_shuffle_v4i32_v4i32__0_u_u_u() { define void @s_shuffle_v4i32_v4i32__1_u_u_u() { ; GFX900-LABEL: s_shuffle_v4i32_v4i32__1_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -11979,6 +12093,7 @@ define void @s_shuffle_v4i32_v4i32__1_u_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4i32_v4i32__1_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -11991,6 +12106,7 @@ define void @s_shuffle_v4i32_v4i32__1_u_u_u() { ; ; GFX942-LABEL: s_shuffle_v4i32_v4i32__1_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -12009,6 +12125,7 @@ define void @s_shuffle_v4i32_v4i32__1_u_u_u() { define void @s_shuffle_v4i32_v4i32__2_u_u_u() { ; GFX900-LABEL: s_shuffle_v4i32_v4i32__2_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -12021,6 +12138,7 @@ define void @s_shuffle_v4i32_v4i32__2_u_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4i32_v4i32__2_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -12033,6 +12151,7 @@ define void @s_shuffle_v4i32_v4i32__2_u_u_u() { ; ; GFX942-LABEL: s_shuffle_v4i32_v4i32__2_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -12051,6 +12170,7 @@ define void @s_shuffle_v4i32_v4i32__2_u_u_u() { define void @s_shuffle_v4i32_v4i32__3_u_u_u() { ; GFX900-LABEL: s_shuffle_v4i32_v4i32__3_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -12063,6 +12183,7 @@ define void @s_shuffle_v4i32_v4i32__3_u_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4i32_v4i32__3_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -12075,6 +12196,7 @@ define void @s_shuffle_v4i32_v4i32__3_u_u_u() { ; ; GFX942-LABEL: s_shuffle_v4i32_v4i32__3_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -12107,6 +12229,7 @@ define void @s_shuffle_v4i32_v4i32__4_u_u_u() { define void @s_shuffle_v4i32_v4i32__5_u_u_u() { ; GFX900-LABEL: s_shuffle_v4i32_v4i32__5_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -12119,6 +12242,7 @@ define void @s_shuffle_v4i32_v4i32__5_u_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4i32_v4i32__5_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -12131,6 +12255,7 @@ define void @s_shuffle_v4i32_v4i32__5_u_u_u() { ; ; GFX942-LABEL: s_shuffle_v4i32_v4i32__5_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -12150,6 +12275,7 @@ define void @s_shuffle_v4i32_v4i32__5_u_u_u() { define void @s_shuffle_v4i32_v4i32__6_u_u_u() { ; GFX900-LABEL: s_shuffle_v4i32_v4i32__6_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -12162,6 +12288,7 @@ define void @s_shuffle_v4i32_v4i32__6_u_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4i32_v4i32__6_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -12174,6 +12301,7 @@ define void @s_shuffle_v4i32_v4i32__6_u_u_u() { ; ; GFX942-LABEL: s_shuffle_v4i32_v4i32__6_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -12193,6 +12321,7 @@ define void @s_shuffle_v4i32_v4i32__6_u_u_u() { define void @s_shuffle_v4i32_v4i32__7_u_u_u() { ; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -12205,6 +12334,7 @@ define void @s_shuffle_v4i32_v4i32__7_u_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -12217,6 +12347,7 @@ define void @s_shuffle_v4i32_v4i32__7_u_u_u() { ; ; GFX942-LABEL: s_shuffle_v4i32_v4i32__7_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -12236,14 +12367,15 @@ define void @s_shuffle_v4i32_v4i32__7_u_u_u() { define void @s_shuffle_v4i32_v4i32__7_0_u_u() { ; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_0_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ; def s[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ; def s[12:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s8, s15 ; GFX900-NEXT: s_mov_b32 s9, s4 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] @@ -12252,14 +12384,15 @@ define void @s_shuffle_v4i32_v4i32__7_0_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_0_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ; def s[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ; def s[12:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s8, s15 ; GFX90A-NEXT: s_mov_b32 s9, s4 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] @@ -12268,6 +12401,7 @@ define void @s_shuffle_v4i32_v4i32__7_0_u_u() { ; ; GFX942-LABEL: s_shuffle_v4i32_v4i32__7_0_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -12343,14 +12477,15 @@ define void @s_shuffle_v4i32_v4i32__7_1_u_u() { define void @s_shuffle_v4i32_v4i32__7_2_u_u() { ; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_2_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ; def s[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ; def s[12:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s8, s15 ; GFX900-NEXT: s_mov_b32 s9, s6 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] @@ -12359,14 +12494,15 @@ define void @s_shuffle_v4i32_v4i32__7_2_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_2_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ; def s[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ; def s[12:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s8, s15 ; GFX90A-NEXT: s_mov_b32 s9, s6 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] @@ -12375,6 +12511,7 @@ define void @s_shuffle_v4i32_v4i32__7_2_u_u() { ; ; GFX942-LABEL: s_shuffle_v4i32_v4i32__7_2_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -12398,14 +12535,15 @@ define void @s_shuffle_v4i32_v4i32__7_2_u_u() { define void @s_shuffle_v4i32_v4i32__7_3_u_u() { ; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_3_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ; def s[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ; def s[12:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s8, s15 ; GFX900-NEXT: s_mov_b32 s9, s7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] @@ -12414,14 +12552,15 @@ define void @s_shuffle_v4i32_v4i32__7_3_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_3_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ; def s[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ; def s[12:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s8, s15 ; GFX90A-NEXT: s_mov_b32 s9, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] @@ -12430,6 +12569,7 @@ define void @s_shuffle_v4i32_v4i32__7_3_u_u() { ; ; GFX942-LABEL: s_shuffle_v4i32_v4i32__7_3_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -12453,6 +12593,7 @@ define void @s_shuffle_v4i32_v4i32__7_3_u_u() { define void @s_shuffle_v4i32_v4i32__7_4_u_u() { ; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_4_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -12466,6 +12607,7 @@ define void @s_shuffle_v4i32_v4i32__7_4_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_4_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -12479,6 +12621,7 @@ define void @s_shuffle_v4i32_v4i32__7_4_u_u() { ; ; GFX942-LABEL: s_shuffle_v4i32_v4i32__7_4_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -12518,6 +12661,7 @@ define void @s_shuffle_v4i32_v4i32__7_5_u_u() { define void @s_shuffle_v4i32_v4i32__7_6_u_u() { ; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_6_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -12531,6 +12675,7 @@ define void @s_shuffle_v4i32_v4i32__7_6_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_6_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -12544,6 +12689,7 @@ define void @s_shuffle_v4i32_v4i32__7_6_u_u() { ; ; GFX942-LABEL: s_shuffle_v4i32_v4i32__7_6_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -12564,6 +12710,7 @@ define void @s_shuffle_v4i32_v4i32__7_6_u_u() { define void @s_shuffle_v4i32_v4i32__7_7_u_u() { ; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -12577,6 +12724,7 @@ define void @s_shuffle_v4i32_v4i32__7_7_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -12590,6 +12738,7 @@ define void @s_shuffle_v4i32_v4i32__7_7_u_u() { ; ; GFX942-LABEL: s_shuffle_v4i32_v4i32__7_7_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -12610,15 +12759,16 @@ define void @s_shuffle_v4i32_v4i32__7_7_u_u() { define void @s_shuffle_v4i32_v4i32__7_7_0_u() { ; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_0_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ; def s[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ; def s[12:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s11 -; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s8, s15 +; GFX900-NEXT: s_mov_b32 s9, s15 ; GFX900-NEXT: s_mov_b32 s10, s4 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] @@ -12627,15 +12777,16 @@ define void @s_shuffle_v4i32_v4i32__7_7_0_u() { ; ; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_0_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ; def s[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ; def s[12:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s11 -; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s8, s15 +; GFX90A-NEXT: s_mov_b32 s9, s15 ; GFX90A-NEXT: s_mov_b32 s10, s4 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] @@ -12644,6 +12795,7 @@ define void @s_shuffle_v4i32_v4i32__7_7_0_u() { ; ; GFX942-LABEL: s_shuffle_v4i32_v4i32__7_7_0_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -12668,15 +12820,16 @@ define void @s_shuffle_v4i32_v4i32__7_7_0_u() { define void @s_shuffle_v4i32_v4i32__7_7_1_u() { ; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_1_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ; def s[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ; def s[12:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s11 -; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s8, s15 +; GFX900-NEXT: s_mov_b32 s9, s15 ; GFX900-NEXT: s_mov_b32 s10, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] @@ -12685,15 +12838,16 @@ define void @s_shuffle_v4i32_v4i32__7_7_1_u() { ; ; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_1_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ; def s[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ; def s[12:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s11 -; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s8, s15 +; GFX90A-NEXT: s_mov_b32 s9, s15 ; GFX90A-NEXT: s_mov_b32 s10, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] @@ -12702,6 +12856,7 @@ define void @s_shuffle_v4i32_v4i32__7_7_1_u() { ; ; GFX942-LABEL: s_shuffle_v4i32_v4i32__7_7_1_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -12781,15 +12936,16 @@ define void @s_shuffle_v4i32_v4i32__7_7_2_u() { define void @s_shuffle_v4i32_v4i32__7_7_3_u() { ; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_3_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ; def s[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ; def s[12:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s11 -; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s8, s15 +; GFX900-NEXT: s_mov_b32 s9, s15 ; GFX900-NEXT: s_mov_b32 s10, s7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] @@ -12798,15 +12954,16 @@ define void @s_shuffle_v4i32_v4i32__7_7_3_u() { ; ; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_3_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ; def s[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ; def s[12:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s11 -; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s8, s15 +; GFX90A-NEXT: s_mov_b32 s9, s15 ; GFX90A-NEXT: s_mov_b32 s10, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] @@ -12815,6 +12972,7 @@ define void @s_shuffle_v4i32_v4i32__7_7_3_u() { ; ; GFX942-LABEL: s_shuffle_v4i32_v4i32__7_7_3_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -12839,6 +12997,7 @@ define void @s_shuffle_v4i32_v4i32__7_7_3_u() { define void @s_shuffle_v4i32_v4i32__7_7_4_u() { ; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_4_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -12853,6 +13012,7 @@ define void @s_shuffle_v4i32_v4i32__7_7_4_u() { ; ; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_4_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -12867,6 +13027,7 @@ define void @s_shuffle_v4i32_v4i32__7_7_4_u() { ; ; GFX942-LABEL: s_shuffle_v4i32_v4i32__7_7_4_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -12888,6 +13049,7 @@ define void @s_shuffle_v4i32_v4i32__7_7_4_u() { define void @s_shuffle_v4i32_v4i32__7_7_5_u() { ; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_5_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -12902,6 +13064,7 @@ define void @s_shuffle_v4i32_v4i32__7_7_5_u() { ; ; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_5_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -12916,6 +13079,7 @@ define void @s_shuffle_v4i32_v4i32__7_7_5_u() { ; ; GFX942-LABEL: s_shuffle_v4i32_v4i32__7_7_5_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -12957,6 +13121,7 @@ define void @s_shuffle_v4i32_v4i32__7_7_6_u() { define void @s_shuffle_v4i32_v4i32__7_7_7_u() { ; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_7_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -12971,6 +13136,7 @@ define void @s_shuffle_v4i32_v4i32__7_7_7_u() { ; ; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_7_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -12985,6 +13151,7 @@ define void @s_shuffle_v4i32_v4i32__7_7_7_u() { ; ; GFX942-LABEL: s_shuffle_v4i32_v4i32__7_7_7_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -13424,6 +13591,7 @@ define void @s_shuffle_v4i32_v4i32__7_7_7_7() { define void @s_shuffle_v4i32_v4i32__u_0_0_0() { ; GFX900-LABEL: s_shuffle_v4i32_v4i32__u_0_0_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -13438,6 +13606,7 @@ define void @s_shuffle_v4i32_v4i32__u_0_0_0() { ; ; GFX90A-LABEL: s_shuffle_v4i32_v4i32__u_0_0_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -13452,6 +13621,7 @@ define void @s_shuffle_v4i32_v4i32__u_0_0_0() { ; ; GFX942-LABEL: s_shuffle_v4i32_v4i32__u_0_0_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -13645,6 +13815,7 @@ define void @s_shuffle_v4i32_v4i32__3_0_0_0() { define void @s_shuffle_v4i32_v4i32__4_0_0_0() { ; GFX900-LABEL: s_shuffle_v4i32_v4i32__4_0_0_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -13659,6 +13830,7 @@ define void @s_shuffle_v4i32_v4i32__4_0_0_0() { ; ; GFX90A-LABEL: s_shuffle_v4i32_v4i32__4_0_0_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -13673,6 +13845,7 @@ define void @s_shuffle_v4i32_v4i32__4_0_0_0() { ; ; GFX942-LABEL: s_shuffle_v4i32_v4i32__4_0_0_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -13876,14 +14049,15 @@ define void @s_shuffle_v4i32_v4i32__7_0_0_0() { define void @s_shuffle_v4i32_v4i32__7_u_0_0() { ; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_u_0_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ; def s[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ; def s[12:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s8, s15 ; GFX900-NEXT: s_mov_b32 s10, s4 ; GFX900-NEXT: s_mov_b32 s11, s4 ; GFX900-NEXT: ;;#ASMSTART @@ -13893,14 +14067,15 @@ define void @s_shuffle_v4i32_v4i32__7_u_0_0() { ; ; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_u_0_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ; def s[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ; def s[12:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s8, s15 ; GFX90A-NEXT: s_mov_b32 s10, s4 ; GFX90A-NEXT: s_mov_b32 s11, s4 ; GFX90A-NEXT: ;;#ASMSTART @@ -13910,6 +14085,7 @@ define void @s_shuffle_v4i32_v4i32__7_u_0_0() { ; ; GFX942-LABEL: s_shuffle_v4i32_v4i32__7_u_0_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -14358,15 +14534,16 @@ define void @s_shuffle_v4i32_v4i32__7_7_0_0() { define void @s_shuffle_v4i32_v4i32__7_7_u_0() { ; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_u_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ; def s[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ; def s[12:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s11 -; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s8, s15 +; GFX900-NEXT: s_mov_b32 s9, s15 ; GFX900-NEXT: s_mov_b32 s11, s4 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] @@ -14375,15 +14552,16 @@ define void @s_shuffle_v4i32_v4i32__7_7_u_0() { ; ; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_u_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ; def s[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ; def s[12:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s11 -; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s8, s15 +; GFX90A-NEXT: s_mov_b32 s9, s15 ; GFX90A-NEXT: s_mov_b32 s11, s4 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] @@ -14392,6 +14570,7 @@ define void @s_shuffle_v4i32_v4i32__7_7_u_0() { ; ; GFX942-LABEL: s_shuffle_v4i32_v4i32__7_7_u_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -15070,14 +15249,15 @@ define void @s_shuffle_v4i32_v4i32__7_1_1_1() { define void @s_shuffle_v4i32_v4i32__7_u_1_1() { ; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_u_1_1: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ; def s[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ; def s[12:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s8, s15 ; GFX900-NEXT: s_mov_b32 s10, s5 ; GFX900-NEXT: s_mov_b32 s11, s5 ; GFX900-NEXT: ;;#ASMSTART @@ -15087,14 +15267,15 @@ define void @s_shuffle_v4i32_v4i32__7_u_1_1() { ; ; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_u_1_1: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ; def s[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ; def s[12:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s8, s15 ; GFX90A-NEXT: s_mov_b32 s10, s5 ; GFX90A-NEXT: s_mov_b32 s11, s5 ; GFX90A-NEXT: ;;#ASMSTART @@ -15104,6 +15285,7 @@ define void @s_shuffle_v4i32_v4i32__7_u_1_1() { ; ; GFX942-LABEL: s_shuffle_v4i32_v4i32__7_u_1_1: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -15552,15 +15734,16 @@ define void @s_shuffle_v4i32_v4i32__7_7_1_1() { define void @s_shuffle_v4i32_v4i32__7_7_u_1() { ; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_u_1: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ; def s[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ; def s[12:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s11 -; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s8, s15 +; GFX900-NEXT: s_mov_b32 s9, s15 ; GFX900-NEXT: s_mov_b32 s11, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] @@ -15569,15 +15752,16 @@ define void @s_shuffle_v4i32_v4i32__7_7_u_1() { ; ; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_u_1: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ; def s[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ; def s[12:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s11 -; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s8, s15 +; GFX90A-NEXT: s_mov_b32 s9, s15 ; GFX90A-NEXT: s_mov_b32 s11, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] @@ -15586,6 +15770,7 @@ define void @s_shuffle_v4i32_v4i32__7_7_u_1() { ; ; GFX942-LABEL: s_shuffle_v4i32_v4i32__7_7_u_1: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -16725,15 +16910,16 @@ define void @s_shuffle_v4i32_v4i32__7_7_2_2() { define void @s_shuffle_v4i32_v4i32__7_7_u_2() { ; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_u_2: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ; def s[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ; def s[12:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s11 -; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s8, s15 +; GFX900-NEXT: s_mov_b32 s9, s15 ; GFX900-NEXT: s_mov_b32 s11, s6 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] @@ -16742,15 +16928,16 @@ define void @s_shuffle_v4i32_v4i32__7_7_u_2() { ; ; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_u_2: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ; def s[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ; def s[12:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s11 -; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s8, s15 +; GFX90A-NEXT: s_mov_b32 s9, s15 ; GFX90A-NEXT: s_mov_b32 s11, s6 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] @@ -16759,6 +16946,7 @@ define void @s_shuffle_v4i32_v4i32__7_7_u_2() { ; ; GFX942-LABEL: s_shuffle_v4i32_v4i32__7_7_u_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -18358,6 +18546,7 @@ define void @s_shuffle_v4i32_v4i32__0_4_4_4() { define void @s_shuffle_v4i32_v4i32__1_4_4_4() { ; GFX900-LABEL: s_shuffle_v4i32_v4i32__1_4_4_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -18370,6 +18559,7 @@ define void @s_shuffle_v4i32_v4i32__1_4_4_4() { ; ; GFX90A-LABEL: s_shuffle_v4i32_v4i32__1_4_4_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -18382,6 +18572,7 @@ define void @s_shuffle_v4i32_v4i32__1_4_4_4() { ; ; GFX942-LABEL: s_shuffle_v4i32_v4i32__1_4_4_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -18400,6 +18591,7 @@ define void @s_shuffle_v4i32_v4i32__1_4_4_4() { define void @s_shuffle_v4i32_v4i32__2_4_4_4() { ; GFX900-LABEL: s_shuffle_v4i32_v4i32__2_4_4_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -18412,6 +18604,7 @@ define void @s_shuffle_v4i32_v4i32__2_4_4_4() { ; ; GFX90A-LABEL: s_shuffle_v4i32_v4i32__2_4_4_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -18424,6 +18617,7 @@ define void @s_shuffle_v4i32_v4i32__2_4_4_4() { ; ; GFX942-LABEL: s_shuffle_v4i32_v4i32__2_4_4_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -18442,6 +18636,7 @@ define void @s_shuffle_v4i32_v4i32__2_4_4_4() { define void @s_shuffle_v4i32_v4i32__3_4_4_4() { ; GFX900-LABEL: s_shuffle_v4i32_v4i32__3_4_4_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -18454,6 +18649,7 @@ define void @s_shuffle_v4i32_v4i32__3_4_4_4() { ; ; GFX90A-LABEL: s_shuffle_v4i32_v4i32__3_4_4_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -18466,6 +18662,7 @@ define void @s_shuffle_v4i32_v4i32__3_4_4_4() { ; ; GFX942-LABEL: s_shuffle_v4i32_v4i32__3_4_4_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -18654,6 +18851,7 @@ define void @s_shuffle_v4i32_v4i32__7_4_4_4() { define void @s_shuffle_v4i32_v4i32__7_u_4_4() { ; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_u_4_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -18668,6 +18866,7 @@ define void @s_shuffle_v4i32_v4i32__7_u_4_4() { ; ; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_u_4_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -18682,6 +18881,7 @@ define void @s_shuffle_v4i32_v4i32__7_u_4_4() { ; ; GFX942-LABEL: s_shuffle_v4i32_v4i32__7_u_4_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -19100,6 +19300,7 @@ define void @s_shuffle_v4i32_v4i32__7_7_4_4() { define void @s_shuffle_v4i32_v4i32__7_7_u_4() { ; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_u_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -19114,6 +19315,7 @@ define void @s_shuffle_v4i32_v4i32__7_7_u_4() { ; ; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_u_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -19128,6 +19330,7 @@ define void @s_shuffle_v4i32_v4i32__7_7_u_4() { ; ; GFX942-LABEL: s_shuffle_v4i32_v4i32__7_7_u_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -19829,6 +20032,7 @@ define void @s_shuffle_v4i32_v4i32__7_5_5_5() { define void @s_shuffle_v4i32_v4i32__7_u_5_5() { ; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_u_5_5: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -19843,6 +20047,7 @@ define void @s_shuffle_v4i32_v4i32__7_u_5_5() { ; ; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_u_5_5: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -19857,6 +20062,7 @@ define void @s_shuffle_v4i32_v4i32__7_u_5_5() { ; ; GFX942-LABEL: s_shuffle_v4i32_v4i32__7_u_5_5: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -20275,6 +20481,7 @@ define void @s_shuffle_v4i32_v4i32__7_7_5_5() { define void @s_shuffle_v4i32_v4i32__7_7_u_5() { ; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_u_5: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -20289,6 +20496,7 @@ define void @s_shuffle_v4i32_v4i32__7_7_u_5() { ; ; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_u_5: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -20303,6 +20511,7 @@ define void @s_shuffle_v4i32_v4i32__7_7_u_5() { ; ; GFX942-LABEL: s_shuffle_v4i32_v4i32__7_7_u_5: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -21349,6 +21558,7 @@ define void @s_shuffle_v4i32_v4i32__7_7_6_6() { define void @s_shuffle_v4i32_v4i32__7_7_u_6() { ; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_u_6: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -21363,6 +21573,7 @@ define void @s_shuffle_v4i32_v4i32__7_7_u_6() { ; ; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_u_6: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -21377,6 +21588,7 @@ define void @s_shuffle_v4i32_v4i32__7_7_u_6() { ; ; GFX942-LABEL: s_shuffle_v4i32_v4i32__7_7_u_6: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v2i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v2i64.ll index ac7d9557ce765..1903d1d833ac9 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v2i64.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v2i64.ll @@ -57,40 +57,44 @@ define void @v_shuffle_v4i64_v2i64__0_u_u_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4i64_v2i64__1_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v2i64__1_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v2i64__1_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v2i64__1_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -113,40 +117,44 @@ define void @v_shuffle_v4i64_v2i64__2_u_u_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4i64_v2i64__3_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v2i64__3_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -433,18 +441,19 @@ define void @v_shuffle_v4i64_v2i64__3_3_1_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr8_vgpr9_vgpr10_vgpr11 +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 ; GFX900-NEXT: v_mov_b32_e32 v4, v6 ; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -452,18 +461,19 @@ define void @v_shuffle_v4i64_v2i64__3_3_1_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr8_vgpr9_vgpr10_vgpr11 +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 ; GFX90A-NEXT: v_mov_b32_e32 v4, v6 ; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -471,18 +481,19 @@ define void @v_shuffle_v4i64_v2i64__3_3_1_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr8_vgpr9_vgpr10_vgpr11 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 ; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 ; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -551,11 +562,14 @@ define void @v_shuffle_v4i64_v2i64__3_3_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 ; GFX900-NEXT: v_mov_b32_e32 v0, v2 ; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -565,11 +579,14 @@ define void @v_shuffle_v4i64_v2i64__3_3_3_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -579,11 +596,14 @@ define void @v_shuffle_v4i64_v2i64__3_3_3_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -844,11 +864,14 @@ define void @v_shuffle_v4i64_v2i64__u_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -858,11 +881,14 @@ define void @v_shuffle_v4i64_v2i64__u_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -872,11 +898,14 @@ define void @v_shuffle_v4i64_v2i64__u_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -995,11 +1024,14 @@ define void @v_shuffle_v4i64_v2i64__2_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1009,11 +1041,14 @@ define void @v_shuffle_v4i64_v2i64__2_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1023,11 +1058,14 @@ define void @v_shuffle_v4i64_v2i64__2_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -1124,6 +1162,7 @@ define void @v_shuffle_v4i64_v2i64__3_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 ; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v4 ; GFX900-NEXT: v_mov_b32_e32 v1, v5 @@ -1144,6 +1183,7 @@ define void @v_shuffle_v4i64_v2i64__3_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: v_mov_b32_e32 v1, v5 @@ -1165,6 +1205,7 @@ define void @v_shuffle_v4i64_v2i64__3_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mov_b32_e32 v0, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 @@ -1399,14 +1440,14 @@ define void @v_shuffle_v4i64_v2i64__3_3_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 ; GFX900-NEXT: v_mov_b32_e32 v2, v4 ; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1419,14 +1460,14 @@ define void @v_shuffle_v4i64_v2i64__3_3_u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1436,18 +1477,17 @@ define void @v_shuffle_v4i64_v2i64__3_3_u_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 ; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -1871,10 +1911,11 @@ define void @v_shuffle_v4i64_v2i64__3_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v2 ; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 ; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v6 ; GFX900-NEXT: v_mov_b32_e32 v1, v7 @@ -1891,10 +1932,11 @@ define void @v_shuffle_v4i64_v2i64__3_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 @@ -1911,10 +1953,11 @@ define void @v_shuffle_v4i64_v2i64__3_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: v_mov_b32_e32 v1, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v7 @@ -2374,40 +2417,44 @@ define void @v_shuffle_v4i64_v2i64__0_2_2_2(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4i64_v2i64__1_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v2i64__1_2_2_2: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v2i64__1_2_2_2: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v2i64__1_2_2_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -2487,54 +2534,61 @@ define void @v_shuffle_v4i64_v2i64__3_u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_u_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: v_mov_b32_e32 v4, v0 ; GFX900-NEXT: v_mov_b32_e32 v5, v1 ; GFX900-NEXT: v_mov_b32_e32 v6, v0 ; GFX900-NEXT: v_mov_b32_e32 v7, v1 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 ; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_u_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v1 ; GFX90A-NEXT: v_mov_b32_e32 v6, v0 ; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 ; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v2i64__3_u_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mov_b32_e32 v4, v0 ; GFX942-NEXT: v_mov_b32_e32 v5, v1 ; GFX942-NEXT: v_mov_b32_e32 v6, v0 ; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 ; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -2760,13 +2814,14 @@ define void @v_shuffle_v4i64_v2i64__3_3_u_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 ; GFX900-NEXT: v_mov_b32_e32 v0, v2 ; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2776,13 +2831,14 @@ define void @v_shuffle_v4i64_v2i64__3_3_u_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2792,13 +2848,14 @@ define void @v_shuffle_v4i64_v2i64__3_3_u_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -3177,11 +3234,14 @@ define void @v_shuffle_v4i64_v2i64__3_u_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v2 ; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3191,11 +3251,14 @@ define void @v_shuffle_v4i64_v2i64__3_u_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3205,11 +3268,14 @@ define void @v_shuffle_v4i64_v2i64__3_u_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i64> asm "; def $0", "=v"() @@ -3704,6 +3770,7 @@ define void @s_shuffle_v4i64_v2i64__0_u_u_u() { define void @s_shuffle_v4i64_v2i64__1_u_u_u() { ; GFX900-LABEL: s_shuffle_v4i64_v2i64__1_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -3717,6 +3784,7 @@ define void @s_shuffle_v4i64_v2i64__1_u_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4i64_v2i64__1_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -3730,6 +3798,7 @@ define void @s_shuffle_v4i64_v2i64__1_u_u_u() { ; ; GFX942-LABEL: s_shuffle_v4i64_v2i64__1_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -3763,6 +3832,7 @@ define void @s_shuffle_v4i64_v2i64__2_u_u_u() { define void @s_shuffle_v4i64_v2i64__3_u_u_u() { ; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -3776,6 +3846,7 @@ define void @s_shuffle_v4i64_v2i64__3_u_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -3789,6 +3860,7 @@ define void @s_shuffle_v4i64_v2i64__3_u_u_u() { ; ; GFX942-LABEL: s_shuffle_v4i64_v2i64__3_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -3809,15 +3881,16 @@ define void @s_shuffle_v4i64_v2i64__3_u_u_u() { define void @s_shuffle_v4i64_v2i64__3_0_u_u() { ; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_0_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ; def s[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ; def s[16:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 ; GFX900-NEXT: s_mov_b32 s10, s4 ; GFX900-NEXT: s_mov_b32 s11, s5 ; GFX900-NEXT: ;;#ASMSTART @@ -3827,15 +3900,16 @@ define void @s_shuffle_v4i64_v2i64__3_0_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_0_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ; def s[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ; def s[16:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 ; GFX90A-NEXT: s_mov_b32 s10, s4 ; GFX90A-NEXT: s_mov_b32 s11, s5 ; GFX90A-NEXT: ;;#ASMSTART @@ -3845,6 +3919,7 @@ define void @s_shuffle_v4i64_v2i64__3_0_u_u() { ; ; GFX942-LABEL: s_shuffle_v4i64_v2i64__3_0_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -3925,6 +4000,7 @@ define void @s_shuffle_v4i64_v2i64__3_1_u_u() { define void @s_shuffle_v4i64_v2i64__3_2_u_u() { ; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_2_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -3940,6 +4016,7 @@ define void @s_shuffle_v4i64_v2i64__3_2_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_2_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -3955,6 +4032,7 @@ define void @s_shuffle_v4i64_v2i64__3_2_u_u() { ; ; GFX942-LABEL: s_shuffle_v4i64_v2i64__3_2_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -5415,6 +5493,7 @@ define void @s_shuffle_v4i64_v2i64__0_2_2_2() { define void @s_shuffle_v4i64_v2i64__1_2_2_2() { ; GFX900-LABEL: s_shuffle_v4i64_v2i64__1_2_2_2: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -5428,6 +5507,7 @@ define void @s_shuffle_v4i64_v2i64__1_2_2_2() { ; ; GFX90A-LABEL: s_shuffle_v4i64_v2i64__1_2_2_2: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -5441,6 +5521,7 @@ define void @s_shuffle_v4i64_v2i64__1_2_2_2() { ; ; GFX942-LABEL: s_shuffle_v4i64_v2i64__1_2_2_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -5636,6 +5717,7 @@ define void @s_shuffle_v4i64_v2i64__3_3_2_2() { define void @s_shuffle_v4i64_v2i64__3_3_u_2() { ; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_3_u_2: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -5653,6 +5735,7 @@ define void @s_shuffle_v4i64_v2i64__3_3_u_2() { ; ; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_3_u_2: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -5670,6 +5753,7 @@ define void @s_shuffle_v4i64_v2i64__3_3_u_2() { ; ; GFX942-LABEL: s_shuffle_v4i64_v2i64__3_3_u_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v3i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v3i64.ll index 8dd4a40d00680..1840680f89805 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v3i64.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v3i64.ll @@ -18,37 +18,47 @@ define void @v_shuffle_v4i64_v3i64__u_u_u_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4i64_v3i64__0_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v3i64__0_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v3i64__0_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v3i64__0_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v6, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -103,6 +113,7 @@ define void @v_shuffle_v4i64_v3i64__2_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v4 ; GFX900-NEXT: v_mov_b32_e32 v1, v5 @@ -116,6 +127,7 @@ define void @v_shuffle_v4i64_v3i64__2_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: v_mov_b32_e32 v1, v5 @@ -129,6 +141,7 @@ define void @v_shuffle_v4i64_v3i64__2_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 @@ -199,6 +212,7 @@ define void @v_shuffle_v4i64_v3i64__5_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v4 ; GFX900-NEXT: v_mov_b32_e32 v1, v5 @@ -212,6 +226,7 @@ define void @v_shuffle_v4i64_v3i64__5_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: v_mov_b32_e32 v1, v5 @@ -225,6 +240,7 @@ define void @v_shuffle_v4i64_v3i64__5_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 @@ -682,6 +698,7 @@ define void @v_shuffle_v4i64_v3i64__5_5_2_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v4 ; GFX900-NEXT: v_mov_b32_e32 v1, v5 @@ -701,6 +718,7 @@ define void @v_shuffle_v4i64_v3i64__5_5_2_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: v_mov_b32_e32 v1, v5 @@ -720,6 +738,7 @@ define void @v_shuffle_v4i64_v3i64__5_5_2_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 @@ -847,6 +866,7 @@ define void @v_shuffle_v4i64_v3i64__5_5_5_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v4 ; GFX900-NEXT: v_mov_b32_e32 v1, v5 @@ -864,6 +884,7 @@ define void @v_shuffle_v4i64_v3i64__5_5_5_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: v_mov_b32_e32 v1, v5 @@ -881,6 +902,7 @@ define void @v_shuffle_v4i64_v3i64__5_5_5_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 @@ -1280,7 +1302,10 @@ define void @v_shuffle_v4i64_v3i64__u_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 ; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1294,7 +1319,10 @@ define void @v_shuffle_v4i64_v3i64__u_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1308,7 +1336,10 @@ define void @v_shuffle_v4i64_v3i64__u_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -1493,7 +1524,10 @@ define void @v_shuffle_v4i64_v3i64__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 ; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1507,7 +1541,10 @@ define void @v_shuffle_v4i64_v3i64__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1521,7 +1558,10 @@ define void @v_shuffle_v4i64_v3i64__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -1680,6 +1720,7 @@ define void @v_shuffle_v4i64_v3i64__5_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 ; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v6 ; GFX900-NEXT: v_mov_b32_e32 v1, v7 @@ -1700,6 +1741,7 @@ define void @v_shuffle_v4i64_v3i64__5_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 @@ -1721,6 +1763,7 @@ define void @v_shuffle_v4i64_v3i64__5_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v7 @@ -2088,12 +2131,14 @@ define void @v_shuffle_v4i64_v3i64__5_5_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: v_mov_b32_e32 v4, v6 ; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 ; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -2107,12 +2152,14 @@ define void @v_shuffle_v4i64_v3i64__5_5_u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: v_mov_b32_e32 v4, v6 ; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -2127,12 +2174,14 @@ define void @v_shuffle_v4i64_v3i64__5_5_u_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mov_b32_e32 v4, v6 ; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 ; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -2812,9 +2861,10 @@ define void @v_shuffle_v4i64_v3i64__5_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: v_mov_b32_e32 v4, v2 ; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v0, v8 ; GFX900-NEXT: v_mov_b32_e32 v1, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 ; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -2831,9 +2881,10 @@ define void @v_shuffle_v4i64_v3i64__5_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v0, v8 ; GFX90A-NEXT: v_mov_b32_e32 v1, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -2851,9 +2902,10 @@ define void @v_shuffle_v4i64_v3i64__5_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mov_b32_e32 v4, v2 ; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v0, v8 ; GFX942-NEXT: v_mov_b32_e32 v1, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 ; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -3907,14 +3959,15 @@ define void @v_shuffle_v4i64_v3i64__5_u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 ; GFX900-NEXT: v_mov_b32_e32 v0, v10 ; GFX900-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 ; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -3926,14 +3979,15 @@ define void @v_shuffle_v4i64_v3i64__5_u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 ; GFX90A-NEXT: v_mov_b32_e32 v0, v10 ; GFX90A-NEXT: v_mov_b32_e32 v1, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -3945,14 +3999,16 @@ define void @v_shuffle_v4i64_v3i64__5_u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v10 ; GFX942-NEXT: v_mov_b32_e32 v1, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 ; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -4630,37 +4686,47 @@ define void @v_shuffle_v4i64_v3i64__u_3_3_3(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4i64_v3i64__0_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v3i64__0_3_3_3: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4i64_v3i64__0_3_3_3: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4i64_v3i64__0_3_3_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v6, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -4715,6 +4781,7 @@ define void @v_shuffle_v4i64_v3i64__2_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v4 ; GFX900-NEXT: v_mov_b32_e32 v1, v5 @@ -4728,6 +4795,7 @@ define void @v_shuffle_v4i64_v3i64__2_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: v_mov_b32_e32 v1, v5 @@ -4741,6 +4809,7 @@ define void @v_shuffle_v4i64_v3i64__2_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 @@ -4894,6 +4963,7 @@ define void @v_shuffle_v4i64_v3i64__5_u_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 ; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v4 ; GFX900-NEXT: v_mov_b32_e32 v1, v5 @@ -4911,6 +4981,7 @@ define void @v_shuffle_v4i64_v3i64__5_u_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: v_mov_b32_e32 v1, v5 @@ -4928,6 +4999,7 @@ define void @v_shuffle_v4i64_v3i64__5_u_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mov_b32_e32 v0, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 @@ -5270,14 +5342,14 @@ define void @v_shuffle_v4i64_v3i64__5_5_u_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 ; GFX900-NEXT: v_mov_b32_e32 v2, v4 ; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5287,14 +5359,14 @@ define void @v_shuffle_v4i64_v3i64__5_5_u_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5304,14 +5376,14 @@ define void @v_shuffle_v4i64_v3i64__5_5_u_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 ; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -5978,6 +6050,7 @@ define void @v_shuffle_v4i64_v3i64__5_u_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: v_mov_b32_e32 v0, v2 ; GFX900-NEXT: v_mov_b32_e32 v1, v3 ; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v4 ; GFX900-NEXT: v_mov_b32_e32 v1, v5 @@ -5995,6 +6068,7 @@ define void @v_shuffle_v4i64_v3i64__5_u_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3 ; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: v_mov_b32_e32 v1, v5 @@ -6012,6 +6086,7 @@ define void @v_shuffle_v4i64_v3i64__5_u_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: v_mov_b32_e32 v1, v3 ; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mov_b32_e32 v0, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 @@ -7016,9 +7091,10 @@ define void @v_shuffle_v4i64_v3i64__5_u_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v4 ; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v0, v4 ; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 ; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -7032,9 +7108,10 @@ define void @v_shuffle_v4i64_v3i64__5_u_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -7048,9 +7125,10 @@ define void @v_shuffle_v4i64_v3i64__5_u_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v0, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 ; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -7792,6 +7870,7 @@ define void @s_shuffle_v4i64_v3i64__1_u_u_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: s_mov_b32 s8, s6 ; GFX900-NEXT: s_mov_b32 s9, s7 ; GFX900-NEXT: ;;#ASMSTART @@ -7805,6 +7884,7 @@ define void @s_shuffle_v4i64_v3i64__1_u_u_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: s_mov_b32 s8, s6 ; GFX90A-NEXT: s_mov_b32 s9, s7 ; GFX90A-NEXT: ;;#ASMSTART @@ -7814,6 +7894,7 @@ define void @s_shuffle_v4i64_v3i64__1_u_u_u() { ; ; GFX942-LABEL: s_shuffle_v4i64_v3i64__1_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -7835,8 +7916,11 @@ define void @s_shuffle_v4i64_v3i64__2_u_u_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX900-NEXT: s_mov_b32 s8, s16 +; GFX900-NEXT: s_mov_b32 s9, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -7846,8 +7930,11 @@ define void @s_shuffle_v4i64_v3i64__2_u_u_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX90A-NEXT: s_mov_b32 s8, s16 +; GFX90A-NEXT: s_mov_b32 s9, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -7855,6 +7942,7 @@ define void @s_shuffle_v4i64_v3i64__2_u_u_u() { ; ; GFX942-LABEL: s_shuffle_v4i64_v3i64__2_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -7892,6 +7980,7 @@ define void @s_shuffle_v4i64_v3i64__4_u_u_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: s_mov_b32 s8, s6 ; GFX900-NEXT: s_mov_b32 s9, s7 ; GFX900-NEXT: ;;#ASMSTART @@ -7905,6 +7994,7 @@ define void @s_shuffle_v4i64_v3i64__4_u_u_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: s_mov_b32 s8, s6 ; GFX90A-NEXT: s_mov_b32 s9, s7 ; GFX90A-NEXT: ;;#ASMSTART @@ -7914,6 +8004,7 @@ define void @s_shuffle_v4i64_v3i64__4_u_u_u() { ; ; GFX942-LABEL: s_shuffle_v4i64_v3i64__4_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -7936,8 +8027,11 @@ define void @s_shuffle_v4i64_v3i64__5_u_u_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX900-NEXT: s_mov_b32 s8, s16 +; GFX900-NEXT: s_mov_b32 s9, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -7947,8 +8041,11 @@ define void @s_shuffle_v4i64_v3i64__5_u_u_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX90A-NEXT: s_mov_b32 s8, s16 +; GFX90A-NEXT: s_mov_b32 s9, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -7956,6 +8053,7 @@ define void @s_shuffle_v4i64_v3i64__5_u_u_u() { ; ; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -7981,10 +8079,11 @@ define void @s_shuffle_v4i64_v3i64__5_0_u_u() { ; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX900-NEXT: s_mov_b32 s8, s16 +; GFX900-NEXT: s_mov_b32 s9, s17 ; GFX900-NEXT: s_mov_b32 s10, s4 ; GFX900-NEXT: s_mov_b32 s11, s5 ; GFX900-NEXT: ;;#ASMSTART @@ -7999,10 +8098,11 @@ define void @s_shuffle_v4i64_v3i64__5_0_u_u() { ; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX90A-NEXT: s_mov_b32 s8, s16 +; GFX90A-NEXT: s_mov_b32 s9, s17 ; GFX90A-NEXT: s_mov_b32 s10, s4 ; GFX90A-NEXT: s_mov_b32 s11, s5 ; GFX90A-NEXT: ;;#ASMSTART @@ -8014,12 +8114,15 @@ define void @s_shuffle_v4i64_v3i64__5_0_u_u() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[12:17] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:9] +; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s16 +; GFX942-NEXT: s_mov_b32 s9, s17 +; GFX942-NEXT: s_mov_b32 s10, s0 ; GFX942-NEXT: s_mov_b32 s11, s1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] @@ -8088,13 +8191,16 @@ define void @s_shuffle_v4i64_v3i64__5_2_u_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[20:25] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s8, s24 +; GFX900-NEXT: s_mov_b32 s9, s25 +; GFX900-NEXT: s_mov_b32 s10, s16 +; GFX900-NEXT: s_mov_b32 s11, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -8104,13 +8210,16 @@ define void @s_shuffle_v4i64_v3i64__5_2_u_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[20:25] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s8, s24 +; GFX90A-NEXT: s_mov_b32 s9, s25 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s11, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -8120,13 +8229,14 @@ define void @s_shuffle_v4i64_v3i64__5_2_u_u() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] +; GFX942-NEXT: ; def s[12:17] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 +; GFX942-NEXT: s_mov_b32 s8, s16 +; GFX942-NEXT: s_mov_b32 s9, s17 ; GFX942-NEXT: s_mov_b32 s10, s4 ; GFX942-NEXT: s_mov_b32 s11, s5 ; GFX942-NEXT: ;;#ASMSTART @@ -8143,12 +8253,15 @@ define void @s_shuffle_v4i64_v3i64__5_2_u_u() { define void @s_shuffle_v4i64_v3i64__5_3_u_u() { ; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_3_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[16:21] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s8, s20 +; GFX900-NEXT: s_mov_b32 s9, s21 +; GFX900-NEXT: s_mov_b32 s10, s16 +; GFX900-NEXT: s_mov_b32 s11, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -8156,12 +8269,15 @@ define void @s_shuffle_v4i64_v3i64__5_3_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_3_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[16:21] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s8, s20 +; GFX90A-NEXT: s_mov_b32 s9, s21 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s11, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -8169,6 +8285,7 @@ define void @s_shuffle_v4i64_v3i64__5_3_u_u() { ; ; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_3_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -8213,12 +8330,13 @@ define void @s_shuffle_v4i64_v3i64__5_5_u_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX900-NEXT: s_mov_b32 s8, s16 +; GFX900-NEXT: s_mov_b32 s9, s17 +; GFX900-NEXT: s_mov_b32 s10, s16 +; GFX900-NEXT: s_mov_b32 s11, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -8228,12 +8346,13 @@ define void @s_shuffle_v4i64_v3i64__5_5_u_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX90A-NEXT: s_mov_b32 s8, s16 +; GFX90A-NEXT: s_mov_b32 s9, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s11, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -8241,6 +8360,7 @@ define void @s_shuffle_v4i64_v3i64__5_5_u_u() { ; ; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_5_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -8268,12 +8388,13 @@ define void @s_shuffle_v4i64_v3i64__5_5_0_u() { ; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX900-NEXT: s_mov_b32 s8, s16 +; GFX900-NEXT: s_mov_b32 s9, s17 +; GFX900-NEXT: s_mov_b32 s10, s16 +; GFX900-NEXT: s_mov_b32 s11, s17 ; GFX900-NEXT: s_mov_b32 s12, s4 ; GFX900-NEXT: s_mov_b32 s13, s5 ; GFX900-NEXT: ;;#ASMSTART @@ -8288,12 +8409,13 @@ define void @s_shuffle_v4i64_v3i64__5_5_0_u() { ; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX90A-NEXT: s_mov_b32 s8, s16 +; GFX90A-NEXT: s_mov_b32 s9, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s11, s17 ; GFX90A-NEXT: s_mov_b32 s12, s4 ; GFX90A-NEXT: s_mov_b32 s13, s5 ; GFX90A-NEXT: ;;#ASMSTART @@ -8305,15 +8427,16 @@ define void @s_shuffle_v4i64_v3i64__5_5_0_u() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] +; GFX942-NEXT: ; def s[12:17] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: s_mov_b32 s8, s16 +; GFX942-NEXT: s_mov_b32 s9, s17 +; GFX942-NEXT: s_mov_b32 s10, s16 +; GFX942-NEXT: s_mov_b32 s11, s17 ; GFX942-NEXT: s_mov_b32 s12, s0 ; GFX942-NEXT: s_mov_b32 s13, s1 ; GFX942-NEXT: ;;#ASMSTART @@ -8335,12 +8458,13 @@ define void @s_shuffle_v4i64_v3i64__5_5_1_u() { ; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX900-NEXT: s_mov_b32 s8, s16 +; GFX900-NEXT: s_mov_b32 s9, s17 +; GFX900-NEXT: s_mov_b32 s10, s16 +; GFX900-NEXT: s_mov_b32 s11, s17 ; GFX900-NEXT: s_mov_b32 s12, s6 ; GFX900-NEXT: s_mov_b32 s13, s7 ; GFX900-NEXT: ;;#ASMSTART @@ -8355,12 +8479,13 @@ define void @s_shuffle_v4i64_v3i64__5_5_1_u() { ; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX90A-NEXT: s_mov_b32 s8, s16 +; GFX90A-NEXT: s_mov_b32 s9, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s11, s17 ; GFX90A-NEXT: s_mov_b32 s12, s6 ; GFX90A-NEXT: s_mov_b32 s13, s7 ; GFX90A-NEXT: ;;#ASMSTART @@ -8372,15 +8497,16 @@ define void @s_shuffle_v4i64_v3i64__5_5_1_u() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] +; GFX942-NEXT: ; def s[12:17] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: s_mov_b32 s8, s16 +; GFX942-NEXT: s_mov_b32 s9, s17 +; GFX942-NEXT: s_mov_b32 s10, s16 +; GFX942-NEXT: s_mov_b32 s11, s17 ; GFX942-NEXT: s_mov_b32 s12, s2 ; GFX942-NEXT: s_mov_b32 s13, s3 ; GFX942-NEXT: ;;#ASMSTART @@ -8458,14 +8584,17 @@ define void @s_shuffle_v4i64_v3i64__5_5_2_u() { define void @s_shuffle_v4i64_v3i64__5_5_3_u() { ; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_3_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ; def s[16:21] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s10, s16 -; GFX900-NEXT: s_mov_b32 s11, s17 +; GFX900-NEXT: s_mov_b32 s8, s20 +; GFX900-NEXT: s_mov_b32 s9, s21 +; GFX900-NEXT: s_mov_b32 s10, s20 +; GFX900-NEXT: s_mov_b32 s11, s21 +; GFX900-NEXT: s_mov_b32 s12, s16 +; GFX900-NEXT: s_mov_b32 s13, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -8473,14 +8602,17 @@ define void @s_shuffle_v4i64_v3i64__5_5_3_u() { ; ; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_3_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ; def s[16:21] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s8, s20 +; GFX90A-NEXT: s_mov_b32 s9, s21 +; GFX90A-NEXT: s_mov_b32 s10, s20 +; GFX90A-NEXT: s_mov_b32 s11, s21 +; GFX90A-NEXT: s_mov_b32 s12, s16 +; GFX90A-NEXT: s_mov_b32 s13, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -8488,6 +8620,7 @@ define void @s_shuffle_v4i64_v3i64__5_5_3_u() { ; ; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_5_3_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -8512,16 +8645,17 @@ define void @s_shuffle_v4i64_v3i64__5_5_3_u() { define void @s_shuffle_v4i64_v3i64__5_5_4_u() { ; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_4_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ; def s[16:21] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s10, s16 -; GFX900-NEXT: s_mov_b32 s11, s17 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s8, s20 +; GFX900-NEXT: s_mov_b32 s9, s21 +; GFX900-NEXT: s_mov_b32 s10, s20 +; GFX900-NEXT: s_mov_b32 s11, s21 +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -8529,16 +8663,17 @@ define void @s_shuffle_v4i64_v3i64__5_5_4_u() { ; ; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_4_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ; def s[16:21] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s8, s20 +; GFX90A-NEXT: s_mov_b32 s9, s21 +; GFX90A-NEXT: s_mov_b32 s10, s20 +; GFX90A-NEXT: s_mov_b32 s11, s21 +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -8546,6 +8681,7 @@ define void @s_shuffle_v4i64_v3i64__5_5_4_u() { ; ; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_5_4_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -8945,6 +9081,7 @@ define void @s_shuffle_v4i64_v3i64__u_0_0_0() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: s_mov_b32 s10, s4 ; GFX900-NEXT: s_mov_b32 s11, s5 ; GFX900-NEXT: s_mov_b32 s12, s4 @@ -8962,6 +9099,7 @@ define void @s_shuffle_v4i64_v3i64__u_0_0_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: s_mov_b32 s10, s4 ; GFX90A-NEXT: s_mov_b32 s11, s5 ; GFX90A-NEXT: s_mov_b32 s12, s4 @@ -8975,6 +9113,7 @@ define void @s_shuffle_v4i64_v3i64__u_0_0_0() { ; ; GFX942-LABEL: s_shuffle_v4i64_v3i64__u_0_0_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -9147,6 +9286,7 @@ define void @s_shuffle_v4i64_v3i64__3_0_0_0() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: s_mov_b32 s10, s4 ; GFX900-NEXT: s_mov_b32 s11, s5 ; GFX900-NEXT: s_mov_b32 s12, s4 @@ -9164,6 +9304,7 @@ define void @s_shuffle_v4i64_v3i64__3_0_0_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: s_mov_b32 s10, s4 ; GFX90A-NEXT: s_mov_b32 s11, s5 ; GFX90A-NEXT: s_mov_b32 s12, s4 @@ -9177,6 +9318,7 @@ define void @s_shuffle_v4i64_v3i64__3_0_0_0() { ; ; GFX942-LABEL: s_shuffle_v4i64_v3i64__3_0_0_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -9349,10 +9491,11 @@ define void @s_shuffle_v4i64_v3i64__5_u_0_0() { ; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX900-NEXT: s_mov_b32 s8, s16 +; GFX900-NEXT: s_mov_b32 s9, s17 ; GFX900-NEXT: s_mov_b32 s12, s4 ; GFX900-NEXT: s_mov_b32 s13, s5 ; GFX900-NEXT: s_mov_b32 s14, s4 @@ -9369,10 +9512,11 @@ define void @s_shuffle_v4i64_v3i64__5_u_0_0() { ; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX90A-NEXT: s_mov_b32 s8, s16 +; GFX90A-NEXT: s_mov_b32 s9, s17 ; GFX90A-NEXT: s_mov_b32 s12, s4 ; GFX90A-NEXT: s_mov_b32 s13, s5 ; GFX90A-NEXT: s_mov_b32 s14, s4 @@ -9386,12 +9530,15 @@ define void @s_shuffle_v4i64_v3i64__5_u_0_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[12:17] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s0 +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:9] +; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s16 +; GFX942-NEXT: s_mov_b32 s9, s17 +; GFX942-NEXT: s_mov_b32 s12, s0 ; GFX942-NEXT: s_mov_b32 s13, s1 ; GFX942-NEXT: s_mov_b32 s14, s0 ; GFX942-NEXT: s_mov_b32 s15, s1 @@ -9765,12 +9912,13 @@ define void @s_shuffle_v4i64_v3i64__5_5_u_0() { ; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX900-NEXT: s_mov_b32 s8, s16 +; GFX900-NEXT: s_mov_b32 s9, s17 +; GFX900-NEXT: s_mov_b32 s10, s16 +; GFX900-NEXT: s_mov_b32 s11, s17 ; GFX900-NEXT: s_mov_b32 s14, s4 ; GFX900-NEXT: s_mov_b32 s15, s5 ; GFX900-NEXT: ;;#ASMSTART @@ -9785,12 +9933,13 @@ define void @s_shuffle_v4i64_v3i64__5_5_u_0() { ; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX90A-NEXT: s_mov_b32 s8, s16 +; GFX90A-NEXT: s_mov_b32 s9, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s11, s17 ; GFX90A-NEXT: s_mov_b32 s14, s4 ; GFX90A-NEXT: s_mov_b32 s15, s5 ; GFX90A-NEXT: ;;#ASMSTART @@ -9802,15 +9951,16 @@ define void @s_shuffle_v4i64_v3i64__5_5_u_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] +; GFX942-NEXT: ; def s[12:17] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: s_mov_b32 s8, s16 +; GFX942-NEXT: s_mov_b32 s9, s17 +; GFX942-NEXT: s_mov_b32 s10, s16 +; GFX942-NEXT: s_mov_b32 s11, s17 ; GFX942-NEXT: s_mov_b32 s14, s0 ; GFX942-NEXT: s_mov_b32 s15, s1 ; GFX942-NEXT: ;;#ASMSTART @@ -10357,10 +10507,11 @@ define void @s_shuffle_v4i64_v3i64__5_u_1_1() { ; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX900-NEXT: s_mov_b32 s8, s16 +; GFX900-NEXT: s_mov_b32 s9, s17 ; GFX900-NEXT: s_mov_b32 s12, s6 ; GFX900-NEXT: s_mov_b32 s13, s7 ; GFX900-NEXT: s_mov_b32 s14, s6 @@ -10377,10 +10528,11 @@ define void @s_shuffle_v4i64_v3i64__5_u_1_1() { ; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX90A-NEXT: s_mov_b32 s8, s16 +; GFX90A-NEXT: s_mov_b32 s9, s17 ; GFX90A-NEXT: s_mov_b32 s12, s6 ; GFX90A-NEXT: s_mov_b32 s13, s7 ; GFX90A-NEXT: s_mov_b32 s14, s6 @@ -10394,12 +10546,15 @@ define void @s_shuffle_v4i64_v3i64__5_u_1_1() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[12:17] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s2 +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:9] +; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s16 +; GFX942-NEXT: s_mov_b32 s9, s17 +; GFX942-NEXT: s_mov_b32 s12, s2 ; GFX942-NEXT: s_mov_b32 s13, s3 ; GFX942-NEXT: s_mov_b32 s14, s2 ; GFX942-NEXT: s_mov_b32 s15, s3 @@ -10769,12 +10924,13 @@ define void @s_shuffle_v4i64_v3i64__5_5_u_1() { ; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX900-NEXT: s_mov_b32 s8, s16 +; GFX900-NEXT: s_mov_b32 s9, s17 +; GFX900-NEXT: s_mov_b32 s10, s16 +; GFX900-NEXT: s_mov_b32 s11, s17 ; GFX900-NEXT: s_mov_b32 s14, s6 ; GFX900-NEXT: s_mov_b32 s15, s7 ; GFX900-NEXT: ;;#ASMSTART @@ -10789,12 +10945,13 @@ define void @s_shuffle_v4i64_v3i64__5_5_u_1() { ; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX90A-NEXT: s_mov_b32 s8, s16 +; GFX90A-NEXT: s_mov_b32 s9, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s11, s17 ; GFX90A-NEXT: s_mov_b32 s14, s6 ; GFX90A-NEXT: s_mov_b32 s15, s7 ; GFX90A-NEXT: ;;#ASMSTART @@ -10806,15 +10963,16 @@ define void @s_shuffle_v4i64_v3i64__5_5_u_1() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] +; GFX942-NEXT: ; def s[12:17] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: s_mov_b32 s8, s16 +; GFX942-NEXT: s_mov_b32 s9, s17 +; GFX942-NEXT: s_mov_b32 s10, s16 +; GFX942-NEXT: s_mov_b32 s11, s17 ; GFX942-NEXT: s_mov_b32 s14, s2 ; GFX942-NEXT: s_mov_b32 s15, s3 ; GFX942-NEXT: ;;#ASMSTART @@ -11734,17 +11892,18 @@ define void @s_shuffle_v4i64_v3i64__5_5_u_2() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[16:21] +; GFX900-NEXT: ; def s[20:25] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s20 -; GFX900-NEXT: s_mov_b32 s9, s21 -; GFX900-NEXT: s_mov_b32 s10, s20 -; GFX900-NEXT: s_mov_b32 s11, s21 -; GFX900-NEXT: s_mov_b32 s14, s12 -; GFX900-NEXT: s_mov_b32 s15, s13 +; GFX900-NEXT: s_mov_b32 s8, s24 +; GFX900-NEXT: s_mov_b32 s9, s25 +; GFX900-NEXT: s_mov_b32 s10, s24 +; GFX900-NEXT: s_mov_b32 s11, s25 +; GFX900-NEXT: s_mov_b32 s14, s16 +; GFX900-NEXT: s_mov_b32 s15, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -11754,17 +11913,18 @@ define void @s_shuffle_v4i64_v3i64__5_5_u_2() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[16:21] +; GFX90A-NEXT: ; def s[20:25] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s20 -; GFX90A-NEXT: s_mov_b32 s9, s21 -; GFX90A-NEXT: s_mov_b32 s10, s20 -; GFX90A-NEXT: s_mov_b32 s11, s21 -; GFX90A-NEXT: s_mov_b32 s14, s12 -; GFX90A-NEXT: s_mov_b32 s15, s13 +; GFX90A-NEXT: s_mov_b32 s8, s24 +; GFX90A-NEXT: s_mov_b32 s9, s25 +; GFX90A-NEXT: s_mov_b32 s10, s24 +; GFX90A-NEXT: s_mov_b32 s11, s25 +; GFX90A-NEXT: s_mov_b32 s14, s16 +; GFX90A-NEXT: s_mov_b32 s15, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -11774,15 +11934,16 @@ define void @s_shuffle_v4i64_v3i64__5_5_u_2() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] +; GFX942-NEXT: ; def s[12:17] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: s_mov_b32 s8, s16 +; GFX942-NEXT: s_mov_b32 s9, s17 +; GFX942-NEXT: s_mov_b32 s10, s16 +; GFX942-NEXT: s_mov_b32 s11, s17 ; GFX942-NEXT: s_mov_b32 s14, s4 ; GFX942-NEXT: s_mov_b32 s15, s5 ; GFX942-NEXT: ;;#ASMSTART @@ -12143,6 +12304,7 @@ define void @s_shuffle_v4i64_v3i64__1_3_3_3() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: s_mov_b32 s8, s6 ; GFX900-NEXT: s_mov_b32 s9, s7 ; GFX900-NEXT: ;;#ASMSTART @@ -12156,6 +12318,7 @@ define void @s_shuffle_v4i64_v3i64__1_3_3_3() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: s_mov_b32 s8, s6 ; GFX90A-NEXT: s_mov_b32 s9, s7 ; GFX90A-NEXT: ;;#ASMSTART @@ -12165,6 +12328,7 @@ define void @s_shuffle_v4i64_v3i64__1_3_3_3() { ; ; GFX942-LABEL: s_shuffle_v4i64_v3i64__1_3_3_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -12186,8 +12350,11 @@ define void @s_shuffle_v4i64_v3i64__2_3_3_3() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX900-NEXT: s_mov_b32 s8, s16 +; GFX900-NEXT: s_mov_b32 s9, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -12197,8 +12364,11 @@ define void @s_shuffle_v4i64_v3i64__2_3_3_3() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX90A-NEXT: s_mov_b32 s8, s16 +; GFX90A-NEXT: s_mov_b32 s9, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -12206,6 +12376,7 @@ define void @s_shuffle_v4i64_v3i64__2_3_3_3() { ; ; GFX942-LABEL: s_shuffle_v4i64_v3i64__2_3_3_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -12363,14 +12534,17 @@ define void @s_shuffle_v4i64_v3i64__5_3_3_3() { define void @s_shuffle_v4i64_v3i64__5_u_3_3() { ; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_u_3_3: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[16:21] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: s_mov_b32 s8, s20 +; GFX900-NEXT: s_mov_b32 s9, s21 +; GFX900-NEXT: s_mov_b32 s12, s16 +; GFX900-NEXT: s_mov_b32 s13, s17 +; GFX900-NEXT: s_mov_b32 s14, s16 +; GFX900-NEXT: s_mov_b32 s15, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -12378,14 +12552,17 @@ define void @s_shuffle_v4i64_v3i64__5_u_3_3() { ; ; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_u_3_3: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[16:21] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: s_mov_b32 s8, s20 +; GFX90A-NEXT: s_mov_b32 s9, s21 +; GFX90A-NEXT: s_mov_b32 s12, s16 +; GFX90A-NEXT: s_mov_b32 s13, s17 +; GFX90A-NEXT: s_mov_b32 s14, s16 +; GFX90A-NEXT: s_mov_b32 s15, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -12393,6 +12570,7 @@ define void @s_shuffle_v4i64_v3i64__5_u_3_3() { ; ; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_u_3_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -12744,16 +12922,17 @@ define void @s_shuffle_v4i64_v3i64__5_5_3_3() { define void @s_shuffle_v4i64_v3i64__5_5_u_3() { ; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_u_3: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ; def s[16:21] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s10, s16 -; GFX900-NEXT: s_mov_b32 s11, s17 -; GFX900-NEXT: s_mov_b32 s14, s12 -; GFX900-NEXT: s_mov_b32 s15, s13 +; GFX900-NEXT: s_mov_b32 s8, s20 +; GFX900-NEXT: s_mov_b32 s9, s21 +; GFX900-NEXT: s_mov_b32 s10, s20 +; GFX900-NEXT: s_mov_b32 s11, s21 +; GFX900-NEXT: s_mov_b32 s14, s16 +; GFX900-NEXT: s_mov_b32 s15, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -12761,16 +12940,17 @@ define void @s_shuffle_v4i64_v3i64__5_5_u_3() { ; ; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_u_3: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ; def s[16:21] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s14, s12 -; GFX90A-NEXT: s_mov_b32 s15, s13 +; GFX90A-NEXT: s_mov_b32 s8, s20 +; GFX90A-NEXT: s_mov_b32 s9, s21 +; GFX90A-NEXT: s_mov_b32 s10, s20 +; GFX90A-NEXT: s_mov_b32 s11, s21 +; GFX90A-NEXT: s_mov_b32 s14, s16 +; GFX90A-NEXT: s_mov_b32 s15, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -12778,6 +12958,7 @@ define void @s_shuffle_v4i64_v3i64__5_5_u_3() { ; ; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_5_u_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -13368,14 +13549,17 @@ define void @s_shuffle_v4i64_v3i64__5_4_4_4() { define void @s_shuffle_v4i64_v3i64__5_u_4_4() { ; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_u_4_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[16:21] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 -; GFX900-NEXT: s_mov_b32 s14, s6 -; GFX900-NEXT: s_mov_b32 s15, s7 +; GFX900-NEXT: s_mov_b32 s8, s20 +; GFX900-NEXT: s_mov_b32 s9, s21 +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s14, s18 +; GFX900-NEXT: s_mov_b32 s15, s19 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -13383,14 +13567,17 @@ define void @s_shuffle_v4i64_v3i64__5_u_4_4() { ; ; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_u_4_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[16:21] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 -; GFX90A-NEXT: s_mov_b32 s14, s6 -; GFX90A-NEXT: s_mov_b32 s15, s7 +; GFX90A-NEXT: s_mov_b32 s8, s20 +; GFX90A-NEXT: s_mov_b32 s9, s21 +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s14, s18 +; GFX90A-NEXT: s_mov_b32 s15, s19 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -13398,6 +13585,7 @@ define void @s_shuffle_v4i64_v3i64__5_u_4_4() { ; ; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_u_4_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -13739,14 +13927,17 @@ define void @s_shuffle_v4i64_v3i64__5_5_4_4() { define void @s_shuffle_v4i64_v3i64__5_5_u_4() { ; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_u_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ; def s[16:21] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s10, s16 -; GFX900-NEXT: s_mov_b32 s11, s17 +; GFX900-NEXT: s_mov_b32 s8, s20 +; GFX900-NEXT: s_mov_b32 s9, s21 +; GFX900-NEXT: s_mov_b32 s10, s20 +; GFX900-NEXT: s_mov_b32 s11, s21 +; GFX900-NEXT: s_mov_b32 s14, s18 +; GFX900-NEXT: s_mov_b32 s15, s19 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -13754,14 +13945,17 @@ define void @s_shuffle_v4i64_v3i64__5_5_u_4() { ; ; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_u_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ; def s[16:21] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s8, s20 +; GFX90A-NEXT: s_mov_b32 s9, s21 +; GFX90A-NEXT: s_mov_b32 s10, s20 +; GFX90A-NEXT: s_mov_b32 s11, s21 +; GFX90A-NEXT: s_mov_b32 s14, s18 +; GFX90A-NEXT: s_mov_b32 s15, s19 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -13769,6 +13963,7 @@ define void @s_shuffle_v4i64_v3i64__5_5_u_4() { ; ; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_5_u_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -14630,14 +14825,15 @@ define void @s_shuffle_v4i64_v3i64__5_5_u_5() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s14, s12 -; GFX900-NEXT: s_mov_b32 s15, s13 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX900-NEXT: s_mov_b32 s8, s16 +; GFX900-NEXT: s_mov_b32 s9, s17 +; GFX900-NEXT: s_mov_b32 s10, s16 +; GFX900-NEXT: s_mov_b32 s11, s17 +; GFX900-NEXT: s_mov_b32 s14, s16 +; GFX900-NEXT: s_mov_b32 s15, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -14647,14 +14843,15 @@ define void @s_shuffle_v4i64_v3i64__5_5_u_5() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s14, s12 -; GFX90A-NEXT: s_mov_b32 s15, s13 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX90A-NEXT: s_mov_b32 s8, s16 +; GFX90A-NEXT: s_mov_b32 s9, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s14, s16 +; GFX90A-NEXT: s_mov_b32 s15, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -14662,6 +14859,7 @@ define void @s_shuffle_v4i64_v3i64__5_5_u_5() { ; ; GFX942-LABEL: s_shuffle_v4i64_v3i64__5_5_u_5: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v4i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v4i64.ll index ea9ef2f1ac94a..8118501d7ca79 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v4i64.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v4i64.ll @@ -142,6 +142,7 @@ define void @v_shuffle_v4i64_v4i64__3_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v6 ; GFX900-NEXT: v_mov_b32_e32 v1, v7 @@ -155,6 +156,7 @@ define void @v_shuffle_v4i64_v4i64__3_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 @@ -168,6 +170,7 @@ define void @v_shuffle_v4i64_v4i64__3_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v7 @@ -278,6 +281,7 @@ define void @v_shuffle_v4i64_v4i64__7_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v6 ; GFX900-NEXT: v_mov_b32_e32 v1, v7 @@ -291,6 +295,7 @@ define void @v_shuffle_v4i64_v4i64__7_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 @@ -304,6 +309,7 @@ define void @v_shuffle_v4i64_v4i64__7_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v7 @@ -918,16 +924,17 @@ define void @v_shuffle_v4i64_v4i64__7_7_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr8_vgpr9_vgpr10_vgpr11 +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: v_mov_b32_e32 v8, v6 ; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: v_mov_b32_e32 v4, v6 ; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v10, v[8:11], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -940,6 +947,7 @@ define void @v_shuffle_v4i64_v4i64__7_7_3_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 @@ -959,6 +967,7 @@ define void @v_shuffle_v4i64_v4i64__7_7_3_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v7 @@ -1135,6 +1144,7 @@ define void @v_shuffle_v4i64_v4i64__7_7_7_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v6 ; GFX900-NEXT: v_mov_b32_e32 v1, v7 @@ -1151,6 +1161,7 @@ define void @v_shuffle_v4i64_v4i64__7_7_7_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 @@ -1167,6 +1178,7 @@ define void @v_shuffle_v4i64_v4i64__7_7_7_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v7 @@ -1685,7 +1697,10 @@ define void @v_shuffle_v4i64_v4i64__u_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 ; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1699,7 +1714,10 @@ define void @v_shuffle_v4i64_v4i64__u_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1713,7 +1731,10 @@ define void @v_shuffle_v4i64_v4i64__u_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -1949,7 +1970,10 @@ define void @v_shuffle_v4i64_v4i64__4_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 ; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1963,7 +1987,10 @@ define void @v_shuffle_v4i64_v4i64__4_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1977,7 +2004,10 @@ define void @v_shuffle_v4i64_v4i64__4_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i64> asm "; def $0", "=v"() @@ -2201,6 +2231,7 @@ define void @v_shuffle_v4i64_v4i64__7_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 ; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v8 ; GFX900-NEXT: v_mov_b32_e32 v1, v9 @@ -2221,6 +2252,7 @@ define void @v_shuffle_v4i64_v4i64__7_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v8 ; GFX90A-NEXT: v_mov_b32_e32 v1, v9 @@ -2242,6 +2274,7 @@ define void @v_shuffle_v4i64_v4i64__7_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mov_b32_e32 v0, v8 ; GFX942-NEXT: v_mov_b32_e32 v1, v9 @@ -2736,16 +2769,17 @@ define void @v_shuffle_v4i64_v4i64__7_7_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v0 -; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: ; implicit-def: $vgpr8_vgpr9_vgpr10_vgpr11 +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v0 +; GFX900-NEXT: v_mov_b32_e32 v11, v1 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: v_mov_b32_e32 v4, v6 ; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2758,12 +2792,13 @@ define void @v_shuffle_v4i64_v4i64__7_7_u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 ; GFX90A-NEXT: v_mov_b32_e32 v6, v8 ; GFX90A-NEXT: v_mov_b32_e32 v7, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -2778,12 +2813,13 @@ define void @v_shuffle_v4i64_v4i64__7_7_u_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 ; GFX942-NEXT: v_mov_b32_e32 v6, v8 ; GFX942-NEXT: v_mov_b32_e32 v7, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 ; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -3706,9 +3742,10 @@ define void @v_shuffle_v4i64_v4i64__7_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: v_mov_b32_e32 v4, v2 ; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v0, v10 ; GFX900-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 ; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -3725,9 +3762,10 @@ define void @v_shuffle_v4i64_v4i64__7_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v0, v10 ; GFX90A-NEXT: v_mov_b32_e32 v1, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -3745,9 +3783,10 @@ define void @v_shuffle_v4i64_v4i64__7_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mov_b32_e32 v4, v2 ; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v0, v10 ; GFX942-NEXT: v_mov_b32_e32 v1, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 ; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -5184,6 +5223,7 @@ define void @v_shuffle_v4i64_v4i64__7_u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: v_mov_b32_e32 v6, v4 ; GFX900-NEXT: v_mov_b32_e32 v7, v5 @@ -5203,6 +5243,7 @@ define void @v_shuffle_v4i64_v4i64__7_u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: v_mov_b32_e32 v6, v4 ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -5219,6 +5260,7 @@ define void @v_shuffle_v4i64_v4i64__7_u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[6:13] @@ -6654,6 +6696,7 @@ define void @v_shuffle_v4i64_v4i64__7_u_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[8:15] @@ -6673,6 +6716,7 @@ define void @v_shuffle_v4i64_v4i64__7_u_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[8:15] @@ -6692,6 +6736,7 @@ define void @v_shuffle_v4i64_v4i64__7_u_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[8:15] @@ -7758,6 +7803,7 @@ define void @v_shuffle_v4i64_v4i64__3_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v6 ; GFX900-NEXT: v_mov_b32_e32 v1, v7 @@ -7771,6 +7817,7 @@ define void @v_shuffle_v4i64_v4i64__3_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 @@ -7784,6 +7831,7 @@ define void @v_shuffle_v4i64_v4i64__3_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v7 @@ -7989,6 +8037,7 @@ define void @v_shuffle_v4i64_v4i64__7_u_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 ; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v6 ; GFX900-NEXT: v_mov_b32_e32 v1, v7 @@ -8006,6 +8055,7 @@ define void @v_shuffle_v4i64_v4i64__7_u_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 @@ -8023,6 +8073,7 @@ define void @v_shuffle_v4i64_v4i64__7_u_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v7 @@ -8484,12 +8535,14 @@ define void @v_shuffle_v4i64_v4i64__7_7_u_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: v_mov_b32_e32 v4, v6 ; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 ; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -8500,12 +8553,14 @@ define void @v_shuffle_v4i64_v4i64__7_7_u_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: v_mov_b32_e32 v4, v6 ; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -8516,12 +8571,14 @@ define void @v_shuffle_v4i64_v4i64__7_7_u_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mov_b32_e32 v4, v6 ; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 ; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -9424,9 +9481,10 @@ define void @v_shuffle_v4i64_v4i64__7_u_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: v_mov_b32_e32 v4, v2 ; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v0, v6 ; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 ; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -9440,9 +9498,10 @@ define void @v_shuffle_v4i64_v4i64__7_u_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -9456,9 +9515,10 @@ define void @v_shuffle_v4i64_v4i64__7_u_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v4, v2 ; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 ; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -10855,9 +10915,10 @@ define void @v_shuffle_v4i64_v4i64__7_u_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v4 ; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v0, v6 ; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 ; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -10871,9 +10932,10 @@ define void @v_shuffle_v4i64_v4i64__7_u_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -10887,9 +10949,10 @@ define void @v_shuffle_v4i64_v4i64__7_u_6_6(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 ; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -12247,6 +12310,7 @@ define void @v_shuffle_v4i64_v4i64__7_u_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: v_mov_b32_e32 v4, v6 ; GFX900-NEXT: v_mov_b32_e32 v5, v7 @@ -12263,6 +12327,7 @@ define void @v_shuffle_v4i64_v4i64__7_u_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v4, v6 ; GFX90A-NEXT: v_mov_b32_e32 v5, v7 @@ -12279,6 +12344,7 @@ define void @v_shuffle_v4i64_v4i64__7_u_7_7(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v4, v6 ; GFX942-NEXT: v_mov_b32_e32 v5, v7 @@ -13263,6 +13329,7 @@ define void @s_shuffle_v4i64_v4i64__1_u_u_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: s_mov_b32 s8, s6 ; GFX900-NEXT: s_mov_b32 s9, s7 ; GFX900-NEXT: ;;#ASMSTART @@ -13276,6 +13343,7 @@ define void @s_shuffle_v4i64_v4i64__1_u_u_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: s_mov_b32 s8, s6 ; GFX90A-NEXT: s_mov_b32 s9, s7 ; GFX90A-NEXT: ;;#ASMSTART @@ -13285,6 +13353,7 @@ define void @s_shuffle_v4i64_v4i64__1_u_u_u() { ; ; GFX942-LABEL: s_shuffle_v4i64_v4i64__1_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -13306,8 +13375,11 @@ define void @s_shuffle_v4i64_v4i64__2_u_u_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX900-NEXT: s_mov_b32 s8, s16 +; GFX900-NEXT: s_mov_b32 s9, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -13317,8 +13389,11 @@ define void @s_shuffle_v4i64_v4i64__2_u_u_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX90A-NEXT: s_mov_b32 s8, s16 +; GFX90A-NEXT: s_mov_b32 s9, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -13326,6 +13401,7 @@ define void @s_shuffle_v4i64_v4i64__2_u_u_u() { ; ; GFX942-LABEL: s_shuffle_v4i64_v4i64__2_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -13347,10 +13423,11 @@ define void @s_shuffle_v4i64_v4i64__3_u_u_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -13360,10 +13437,11 @@ define void @s_shuffle_v4i64_v4i64__3_u_u_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -13371,6 +13449,7 @@ define void @s_shuffle_v4i64_v4i64__3_u_u_u() { ; ; GFX942-LABEL: s_shuffle_v4i64_v4i64__3_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -13408,6 +13487,7 @@ define void @s_shuffle_v4i64_v4i64__5_u_u_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: s_mov_b32 s8, s6 ; GFX900-NEXT: s_mov_b32 s9, s7 ; GFX900-NEXT: ;;#ASMSTART @@ -13421,6 +13501,7 @@ define void @s_shuffle_v4i64_v4i64__5_u_u_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: s_mov_b32 s8, s6 ; GFX90A-NEXT: s_mov_b32 s9, s7 ; GFX90A-NEXT: ;;#ASMSTART @@ -13430,6 +13511,7 @@ define void @s_shuffle_v4i64_v4i64__5_u_u_u() { ; ; GFX942-LABEL: s_shuffle_v4i64_v4i64__5_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -13452,8 +13534,11 @@ define void @s_shuffle_v4i64_v4i64__6_u_u_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX900-NEXT: s_mov_b32 s8, s16 +; GFX900-NEXT: s_mov_b32 s9, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -13463,8 +13548,11 @@ define void @s_shuffle_v4i64_v4i64__6_u_u_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX90A-NEXT: s_mov_b32 s8, s16 +; GFX90A-NEXT: s_mov_b32 s9, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -13472,6 +13560,7 @@ define void @s_shuffle_v4i64_v4i64__6_u_u_u() { ; ; GFX942-LABEL: s_shuffle_v4i64_v4i64__6_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -13494,10 +13583,11 @@ define void @s_shuffle_v4i64_v4i64__7_u_u_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -13507,10 +13597,11 @@ define void @s_shuffle_v4i64_v4i64__7_u_u_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -13518,6 +13609,7 @@ define void @s_shuffle_v4i64_v4i64__7_u_u_u() { ; ; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -13543,10 +13635,11 @@ define void @s_shuffle_v4i64_v4i64__7_0_u_u() { ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 ; GFX900-NEXT: s_mov_b32 s10, s4 ; GFX900-NEXT: s_mov_b32 s11, s5 ; GFX900-NEXT: ;;#ASMSTART @@ -13561,10 +13654,11 @@ define void @s_shuffle_v4i64_v4i64__7_0_u_u() { ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 ; GFX90A-NEXT: s_mov_b32 s10, s4 ; GFX90A-NEXT: s_mov_b32 s11, s5 ; GFX90A-NEXT: ;;#ASMSTART @@ -13576,14 +13670,14 @@ define void @s_shuffle_v4i64_v4i64__7_0_u_u() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[12:19] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] +; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: s_mov_b32 s8, s18 +; GFX942-NEXT: s_mov_b32 s9, s19 ; GFX942-NEXT: s_mov_b32 s10, s0 ; GFX942-NEXT: s_mov_b32 s11, s1 ; GFX942-NEXT: ;;#ASMSTART @@ -13657,15 +13751,16 @@ define void @s_shuffle_v4i64_v4i64__7_2_u_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[20:27] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s8, s26 +; GFX900-NEXT: s_mov_b32 s9, s27 +; GFX900-NEXT: s_mov_b32 s10, s16 +; GFX900-NEXT: s_mov_b32 s11, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -13675,15 +13770,16 @@ define void @s_shuffle_v4i64_v4i64__7_2_u_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[20:27] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s8, s26 +; GFX90A-NEXT: s_mov_b32 s9, s27 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s11, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -13693,13 +13789,14 @@ define void @s_shuffle_v4i64_v4i64__7_2_u_u() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[12:19] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 +; GFX942-NEXT: s_mov_b32 s8, s18 +; GFX942-NEXT: s_mov_b32 s9, s19 ; GFX942-NEXT: s_mov_b32 s10, s4 ; GFX942-NEXT: s_mov_b32 s11, s5 ; GFX942-NEXT: ;;#ASMSTART @@ -13718,13 +13815,16 @@ define void @s_shuffle_v4i64_v4i64__7_3_u_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ; def s[20:27] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s18 -; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: s_mov_b32 s8, s26 +; GFX900-NEXT: s_mov_b32 s9, s27 +; GFX900-NEXT: s_mov_b32 s10, s18 +; GFX900-NEXT: s_mov_b32 s11, s19 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -13734,13 +13834,16 @@ define void @s_shuffle_v4i64_v4i64__7_3_u_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ; def s[20:27] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s18 -; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: s_mov_b32 s8, s26 +; GFX90A-NEXT: s_mov_b32 s9, s27 +; GFX90A-NEXT: s_mov_b32 s10, s18 +; GFX90A-NEXT: s_mov_b32 s11, s19 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -13750,13 +13853,14 @@ define void @s_shuffle_v4i64_v4i64__7_3_u_u() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[12:19] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 +; GFX942-NEXT: s_mov_b32 s8, s18 +; GFX942-NEXT: s_mov_b32 s9, s19 ; GFX942-NEXT: s_mov_b32 s10, s6 ; GFX942-NEXT: s_mov_b32 s11, s7 ; GFX942-NEXT: ;;#ASMSTART @@ -13773,14 +13877,15 @@ define void @s_shuffle_v4i64_v4i64__7_3_u_u() { define void @s_shuffle_v4i64_v4i64__7_4_u_u() { ; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_4_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[16:23] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s8, s22 +; GFX900-NEXT: s_mov_b32 s9, s23 +; GFX900-NEXT: s_mov_b32 s10, s16 +; GFX900-NEXT: s_mov_b32 s11, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -13788,14 +13893,15 @@ define void @s_shuffle_v4i64_v4i64__7_4_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_4_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[16:23] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s8, s22 +; GFX90A-NEXT: s_mov_b32 s9, s23 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s11, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -13803,6 +13909,7 @@ define void @s_shuffle_v4i64_v4i64__7_4_u_u() { ; ; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_4_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -13847,12 +13954,13 @@ define void @s_shuffle_v4i64_v4i64__7_6_u_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: s_mov_b32 s10, s16 +; GFX900-NEXT: s_mov_b32 s11, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -13862,12 +13970,13 @@ define void @s_shuffle_v4i64_v4i64__7_6_u_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s11, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -13875,6 +13984,7 @@ define void @s_shuffle_v4i64_v4i64__7_6_u_u() { ; ; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_6_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -13899,10 +14009,13 @@ define void @s_shuffle_v4i64_v4i64__7_7_u_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: s_mov_b32 s10, s18 +; GFX900-NEXT: s_mov_b32 s11, s19 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -13912,10 +14025,13 @@ define void @s_shuffle_v4i64_v4i64__7_7_u_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: s_mov_b32 s10, s18 +; GFX90A-NEXT: s_mov_b32 s11, s19 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -13923,6 +14039,7 @@ define void @s_shuffle_v4i64_v4i64__7_7_u_u() { ; ; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -13950,12 +14067,13 @@ define void @s_shuffle_v4i64_v4i64__7_7_0_u() { ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: s_mov_b32 s10, s18 +; GFX900-NEXT: s_mov_b32 s11, s19 ; GFX900-NEXT: s_mov_b32 s12, s4 ; GFX900-NEXT: s_mov_b32 s13, s5 ; GFX900-NEXT: ;;#ASMSTART @@ -13970,12 +14088,13 @@ define void @s_shuffle_v4i64_v4i64__7_7_0_u() { ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: s_mov_b32 s10, s18 +; GFX90A-NEXT: s_mov_b32 s11, s19 ; GFX90A-NEXT: s_mov_b32 s12, s4 ; GFX90A-NEXT: s_mov_b32 s13, s5 ; GFX90A-NEXT: ;;#ASMSTART @@ -13987,14 +14106,17 @@ define void @s_shuffle_v4i64_v4i64__7_7_0_u() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[12:19] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s0 +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] +; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: s_mov_b32 s8, s18 +; GFX942-NEXT: s_mov_b32 s9, s19 +; GFX942-NEXT: s_mov_b32 s10, s18 +; GFX942-NEXT: s_mov_b32 s11, s19 +; GFX942-NEXT: s_mov_b32 s12, s0 ; GFX942-NEXT: s_mov_b32 s13, s1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] @@ -14015,12 +14137,13 @@ define void @s_shuffle_v4i64_v4i64__7_7_1_u() { ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: s_mov_b32 s10, s18 +; GFX900-NEXT: s_mov_b32 s11, s19 ; GFX900-NEXT: s_mov_b32 s12, s6 ; GFX900-NEXT: s_mov_b32 s13, s7 ; GFX900-NEXT: ;;#ASMSTART @@ -14035,12 +14158,13 @@ define void @s_shuffle_v4i64_v4i64__7_7_1_u() { ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: s_mov_b32 s10, s18 +; GFX90A-NEXT: s_mov_b32 s11, s19 ; GFX90A-NEXT: s_mov_b32 s12, s6 ; GFX90A-NEXT: s_mov_b32 s13, s7 ; GFX90A-NEXT: ;;#ASMSTART @@ -14052,14 +14176,17 @@ define void @s_shuffle_v4i64_v4i64__7_7_1_u() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[12:19] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s2 +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] +; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: s_mov_b32 s8, s18 +; GFX942-NEXT: s_mov_b32 s9, s19 +; GFX942-NEXT: s_mov_b32 s10, s18 +; GFX942-NEXT: s_mov_b32 s11, s19 +; GFX942-NEXT: s_mov_b32 s12, s2 ; GFX942-NEXT: s_mov_b32 s13, s3 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] @@ -14134,15 +14261,18 @@ define void @s_shuffle_v4i64_v4i64__7_7_3_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[20:27] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s8, s26 +; GFX900-NEXT: s_mov_b32 s9, s27 +; GFX900-NEXT: s_mov_b32 s10, s26 +; GFX900-NEXT: s_mov_b32 s11, s27 +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -14152,15 +14282,18 @@ define void @s_shuffle_v4i64_v4i64__7_7_3_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[20:27] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s8, s26 +; GFX90A-NEXT: s_mov_b32 s9, s27 +; GFX90A-NEXT: s_mov_b32 s10, s26 +; GFX90A-NEXT: s_mov_b32 s11, s27 +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -14170,15 +14303,16 @@ define void @s_shuffle_v4i64_v4i64__7_7_3_u() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[12:19] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s14 -; GFX942-NEXT: s_mov_b32 s11, s15 +; GFX942-NEXT: s_mov_b32 s8, s18 +; GFX942-NEXT: s_mov_b32 s9, s19 +; GFX942-NEXT: s_mov_b32 s10, s18 +; GFX942-NEXT: s_mov_b32 s11, s19 ; GFX942-NEXT: s_mov_b32 s12, s6 ; GFX942-NEXT: s_mov_b32 s13, s7 ; GFX942-NEXT: ;;#ASMSTART @@ -14195,14 +14329,17 @@ define void @s_shuffle_v4i64_v4i64__7_7_3_u() { define void @s_shuffle_v4i64_v4i64__7_7_4_u() { ; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_4_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[16:23] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s8, s22 +; GFX900-NEXT: s_mov_b32 s9, s23 +; GFX900-NEXT: s_mov_b32 s10, s22 +; GFX900-NEXT: s_mov_b32 s11, s23 +; GFX900-NEXT: s_mov_b32 s12, s16 +; GFX900-NEXT: s_mov_b32 s13, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -14210,14 +14347,17 @@ define void @s_shuffle_v4i64_v4i64__7_7_4_u() { ; ; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_4_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[16:23] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s8, s22 +; GFX90A-NEXT: s_mov_b32 s9, s23 +; GFX90A-NEXT: s_mov_b32 s10, s22 +; GFX90A-NEXT: s_mov_b32 s11, s23 +; GFX90A-NEXT: s_mov_b32 s12, s16 +; GFX90A-NEXT: s_mov_b32 s13, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -14225,6 +14365,7 @@ define void @s_shuffle_v4i64_v4i64__7_7_4_u() { ; ; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_4_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -14249,14 +14390,17 @@ define void @s_shuffle_v4i64_v4i64__7_7_4_u() { define void @s_shuffle_v4i64_v4i64__7_7_5_u() { ; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_5_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[16:23] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: s_mov_b32 s8, s22 +; GFX900-NEXT: s_mov_b32 s9, s23 +; GFX900-NEXT: s_mov_b32 s10, s22 +; GFX900-NEXT: s_mov_b32 s11, s23 +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -14264,14 +14408,17 @@ define void @s_shuffle_v4i64_v4i64__7_7_5_u() { ; ; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_5_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[16:23] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: s_mov_b32 s8, s22 +; GFX90A-NEXT: s_mov_b32 s9, s23 +; GFX90A-NEXT: s_mov_b32 s10, s22 +; GFX90A-NEXT: s_mov_b32 s11, s23 +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -14279,6 +14426,7 @@ define void @s_shuffle_v4i64_v4i64__7_7_5_u() { ; ; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_5_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -14327,14 +14475,15 @@ define void @s_shuffle_v4i64_v4i64__7_7_7_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: s_mov_b32 s10, s18 +; GFX900-NEXT: s_mov_b32 s11, s19 +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -14344,14 +14493,15 @@ define void @s_shuffle_v4i64_v4i64__7_7_7_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: s_mov_b32 s10, s18 +; GFX90A-NEXT: s_mov_b32 s11, s19 +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -14359,6 +14509,7 @@ define void @s_shuffle_v4i64_v4i64__7_7_7_u() { ; ; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_7_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -14885,6 +15036,7 @@ define void @s_shuffle_v4i64_v4i64__u_0_0_0() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: s_mov_b32 s10, s4 ; GFX900-NEXT: s_mov_b32 s11, s5 ; GFX900-NEXT: s_mov_b32 s12, s4 @@ -14902,6 +15054,7 @@ define void @s_shuffle_v4i64_v4i64__u_0_0_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: s_mov_b32 s10, s4 ; GFX90A-NEXT: s_mov_b32 s11, s5 ; GFX90A-NEXT: s_mov_b32 s12, s4 @@ -14915,6 +15068,7 @@ define void @s_shuffle_v4i64_v4i64__u_0_0_0() { ; ; GFX942-LABEL: s_shuffle_v4i64_v4i64__u_0_0_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -15150,6 +15304,7 @@ define void @s_shuffle_v4i64_v4i64__4_0_0_0() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: s_mov_b32 s10, s4 ; GFX900-NEXT: s_mov_b32 s11, s5 ; GFX900-NEXT: s_mov_b32 s12, s4 @@ -15167,6 +15322,7 @@ define void @s_shuffle_v4i64_v4i64__4_0_0_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: s_mov_b32 s10, s4 ; GFX90A-NEXT: s_mov_b32 s11, s5 ; GFX90A-NEXT: s_mov_b32 s12, s4 @@ -15180,6 +15336,7 @@ define void @s_shuffle_v4i64_v4i64__4_0_0_0() { ; ; GFX942-LABEL: s_shuffle_v4i64_v4i64__4_0_0_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -15425,10 +15582,11 @@ define void @s_shuffle_v4i64_v4i64__7_u_0_0() { ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 ; GFX900-NEXT: s_mov_b32 s12, s4 ; GFX900-NEXT: s_mov_b32 s13, s5 ; GFX900-NEXT: s_mov_b32 s14, s4 @@ -15445,10 +15603,11 @@ define void @s_shuffle_v4i64_v4i64__7_u_0_0() { ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 ; GFX90A-NEXT: s_mov_b32 s12, s4 ; GFX90A-NEXT: s_mov_b32 s13, s5 ; GFX90A-NEXT: s_mov_b32 s14, s4 @@ -15462,14 +15621,15 @@ define void @s_shuffle_v4i64_v4i64__7_u_0_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[12:19] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s0 +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] +; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: s_mov_b32 s8, s18 +; GFX942-NEXT: s_mov_b32 s9, s19 +; GFX942-NEXT: s_mov_b32 s12, s0 ; GFX942-NEXT: s_mov_b32 s13, s1 ; GFX942-NEXT: s_mov_b32 s14, s0 ; GFX942-NEXT: s_mov_b32 s15, s1 @@ -15991,12 +16151,13 @@ define void @s_shuffle_v4i64_v4i64__7_7_u_0() { ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: s_mov_b32 s10, s18 +; GFX900-NEXT: s_mov_b32 s11, s19 ; GFX900-NEXT: s_mov_b32 s14, s4 ; GFX900-NEXT: s_mov_b32 s15, s5 ; GFX900-NEXT: ;;#ASMSTART @@ -16011,12 +16172,13 @@ define void @s_shuffle_v4i64_v4i64__7_7_u_0() { ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: s_mov_b32 s10, s18 +; GFX90A-NEXT: s_mov_b32 s11, s19 ; GFX90A-NEXT: s_mov_b32 s14, s4 ; GFX90A-NEXT: s_mov_b32 s15, s5 ; GFX90A-NEXT: ;;#ASMSTART @@ -16028,14 +16190,17 @@ define void @s_shuffle_v4i64_v4i64__7_7_u_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[12:19] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s14, s0 +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] +; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: s_mov_b32 s8, s18 +; GFX942-NEXT: s_mov_b32 s9, s19 +; GFX942-NEXT: s_mov_b32 s10, s18 +; GFX942-NEXT: s_mov_b32 s11, s19 +; GFX942-NEXT: s_mov_b32 s14, s0 ; GFX942-NEXT: s_mov_b32 s15, s1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] @@ -16803,10 +16968,11 @@ define void @s_shuffle_v4i64_v4i64__7_u_1_1() { ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 ; GFX900-NEXT: s_mov_b32 s12, s6 ; GFX900-NEXT: s_mov_b32 s13, s7 ; GFX900-NEXT: s_mov_b32 s14, s6 @@ -16823,10 +16989,11 @@ define void @s_shuffle_v4i64_v4i64__7_u_1_1() { ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 ; GFX90A-NEXT: s_mov_b32 s12, s6 ; GFX90A-NEXT: s_mov_b32 s13, s7 ; GFX90A-NEXT: s_mov_b32 s14, s6 @@ -16840,14 +17007,15 @@ define void @s_shuffle_v4i64_v4i64__7_u_1_1() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[12:19] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s2 +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] +; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: s_mov_b32 s8, s18 +; GFX942-NEXT: s_mov_b32 s9, s19 +; GFX942-NEXT: s_mov_b32 s12, s2 ; GFX942-NEXT: s_mov_b32 s13, s3 ; GFX942-NEXT: s_mov_b32 s14, s2 ; GFX942-NEXT: s_mov_b32 s15, s3 @@ -17365,12 +17533,13 @@ define void @s_shuffle_v4i64_v4i64__7_7_u_1() { ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: s_mov_b32 s10, s18 +; GFX900-NEXT: s_mov_b32 s11, s19 ; GFX900-NEXT: s_mov_b32 s14, s6 ; GFX900-NEXT: s_mov_b32 s15, s7 ; GFX900-NEXT: ;;#ASMSTART @@ -17385,12 +17554,13 @@ define void @s_shuffle_v4i64_v4i64__7_7_u_1() { ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: s_mov_b32 s10, s18 +; GFX90A-NEXT: s_mov_b32 s11, s19 ; GFX90A-NEXT: s_mov_b32 s14, s6 ; GFX90A-NEXT: s_mov_b32 s15, s7 ; GFX90A-NEXT: ;;#ASMSTART @@ -17402,14 +17572,17 @@ define void @s_shuffle_v4i64_v4i64__7_7_u_1() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[12:19] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s14, s2 +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] +; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: s_mov_b32 s8, s18 +; GFX942-NEXT: s_mov_b32 s9, s19 +; GFX942-NEXT: s_mov_b32 s10, s18 +; GFX942-NEXT: s_mov_b32 s11, s19 +; GFX942-NEXT: s_mov_b32 s14, s2 ; GFX942-NEXT: s_mov_b32 s15, s3 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] @@ -18688,15 +18861,18 @@ define void @s_shuffle_v4i64_v4i64__7_7_u_2() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[20:27] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s14, s12 -; GFX900-NEXT: s_mov_b32 s15, s13 +; GFX900-NEXT: s_mov_b32 s8, s26 +; GFX900-NEXT: s_mov_b32 s9, s27 +; GFX900-NEXT: s_mov_b32 s10, s26 +; GFX900-NEXT: s_mov_b32 s11, s27 +; GFX900-NEXT: s_mov_b32 s14, s16 +; GFX900-NEXT: s_mov_b32 s15, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -18706,15 +18882,18 @@ define void @s_shuffle_v4i64_v4i64__7_7_u_2() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[20:27] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s14, s12 -; GFX90A-NEXT: s_mov_b32 s15, s13 +; GFX90A-NEXT: s_mov_b32 s8, s26 +; GFX90A-NEXT: s_mov_b32 s9, s27 +; GFX90A-NEXT: s_mov_b32 s10, s26 +; GFX90A-NEXT: s_mov_b32 s11, s27 +; GFX90A-NEXT: s_mov_b32 s14, s16 +; GFX90A-NEXT: s_mov_b32 s15, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -18724,15 +18903,16 @@ define void @s_shuffle_v4i64_v4i64__7_7_u_2() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[12:19] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s14 -; GFX942-NEXT: s_mov_b32 s11, s15 +; GFX942-NEXT: s_mov_b32 s8, s18 +; GFX942-NEXT: s_mov_b32 s9, s19 +; GFX942-NEXT: s_mov_b32 s10, s18 +; GFX942-NEXT: s_mov_b32 s11, s19 ; GFX942-NEXT: s_mov_b32 s14, s4 ; GFX942-NEXT: s_mov_b32 s15, s5 ; GFX942-NEXT: ;;#ASMSTART @@ -20509,6 +20689,7 @@ define void @s_shuffle_v4i64_v4i64__1_4_4_4() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: s_mov_b32 s8, s6 ; GFX900-NEXT: s_mov_b32 s9, s7 ; GFX900-NEXT: ;;#ASMSTART @@ -20522,6 +20703,7 @@ define void @s_shuffle_v4i64_v4i64__1_4_4_4() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: s_mov_b32 s8, s6 ; GFX90A-NEXT: s_mov_b32 s9, s7 ; GFX90A-NEXT: ;;#ASMSTART @@ -20531,6 +20713,7 @@ define void @s_shuffle_v4i64_v4i64__1_4_4_4() { ; ; GFX942-LABEL: s_shuffle_v4i64_v4i64__1_4_4_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -20552,8 +20735,11 @@ define void @s_shuffle_v4i64_v4i64__2_4_4_4() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX900-NEXT: s_mov_b32 s8, s16 +; GFX900-NEXT: s_mov_b32 s9, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -20563,8 +20749,11 @@ define void @s_shuffle_v4i64_v4i64__2_4_4_4() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX90A-NEXT: s_mov_b32 s8, s16 +; GFX90A-NEXT: s_mov_b32 s9, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -20572,6 +20761,7 @@ define void @s_shuffle_v4i64_v4i64__2_4_4_4() { ; ; GFX942-LABEL: s_shuffle_v4i64_v4i64__2_4_4_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -20593,10 +20783,11 @@ define void @s_shuffle_v4i64_v4i64__3_4_4_4() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -20606,10 +20797,11 @@ define void @s_shuffle_v4i64_v4i64__3_4_4_4() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -20617,6 +20809,7 @@ define void @s_shuffle_v4i64_v4i64__3_4_4_4() { ; ; GFX942-LABEL: s_shuffle_v4i64_v4i64__3_4_4_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -20838,16 +21031,17 @@ define void @s_shuffle_v4i64_v4i64__7_4_4_4() { define void @s_shuffle_v4i64_v4i64__7_u_4_4() { ; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_u_4_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[16:23] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: s_mov_b32 s8, s22 +; GFX900-NEXT: s_mov_b32 s9, s23 +; GFX900-NEXT: s_mov_b32 s12, s16 +; GFX900-NEXT: s_mov_b32 s13, s17 +; GFX900-NEXT: s_mov_b32 s14, s16 +; GFX900-NEXT: s_mov_b32 s15, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -20855,16 +21049,17 @@ define void @s_shuffle_v4i64_v4i64__7_u_4_4() { ; ; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_u_4_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[16:23] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: s_mov_b32 s8, s22 +; GFX90A-NEXT: s_mov_b32 s9, s23 +; GFX90A-NEXT: s_mov_b32 s12, s16 +; GFX90A-NEXT: s_mov_b32 s13, s17 +; GFX90A-NEXT: s_mov_b32 s14, s16 +; GFX90A-NEXT: s_mov_b32 s15, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -20872,6 +21067,7 @@ define void @s_shuffle_v4i64_v4i64__7_u_4_4() { ; ; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_u_4_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -21367,14 +21563,17 @@ define void @s_shuffle_v4i64_v4i64__7_7_4_4() { define void @s_shuffle_v4i64_v4i64__7_7_u_4() { ; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_u_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[16:23] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: s_mov_b32 s8, s22 +; GFX900-NEXT: s_mov_b32 s9, s23 +; GFX900-NEXT: s_mov_b32 s10, s22 +; GFX900-NEXT: s_mov_b32 s11, s23 +; GFX900-NEXT: s_mov_b32 s14, s16 +; GFX900-NEXT: s_mov_b32 s15, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -21382,14 +21581,17 @@ define void @s_shuffle_v4i64_v4i64__7_7_u_4() { ; ; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_u_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[16:23] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: s_mov_b32 s8, s22 +; GFX90A-NEXT: s_mov_b32 s9, s23 +; GFX90A-NEXT: s_mov_b32 s10, s22 +; GFX90A-NEXT: s_mov_b32 s11, s23 +; GFX90A-NEXT: s_mov_b32 s14, s16 +; GFX90A-NEXT: s_mov_b32 s15, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -21397,6 +21599,7 @@ define void @s_shuffle_v4i64_v4i64__7_7_u_4() { ; ; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_u_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -22199,16 +22402,17 @@ define void @s_shuffle_v4i64_v4i64__7_5_5_5() { define void @s_shuffle_v4i64_v4i64__7_u_5_5() { ; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_u_5_5: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[16:23] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 -; GFX900-NEXT: s_mov_b32 s14, s6 -; GFX900-NEXT: s_mov_b32 s15, s7 +; GFX900-NEXT: s_mov_b32 s8, s22 +; GFX900-NEXT: s_mov_b32 s9, s23 +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s14, s18 +; GFX900-NEXT: s_mov_b32 s15, s19 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -22216,16 +22420,17 @@ define void @s_shuffle_v4i64_v4i64__7_u_5_5() { ; ; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_u_5_5: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[16:23] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 -; GFX90A-NEXT: s_mov_b32 s14, s6 -; GFX90A-NEXT: s_mov_b32 s15, s7 +; GFX90A-NEXT: s_mov_b32 s8, s22 +; GFX90A-NEXT: s_mov_b32 s9, s23 +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s14, s18 +; GFX90A-NEXT: s_mov_b32 s15, s19 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -22233,6 +22438,7 @@ define void @s_shuffle_v4i64_v4i64__7_u_5_5() { ; ; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_u_5_5: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -22708,14 +22914,17 @@ define void @s_shuffle_v4i64_v4i64__7_7_5_5() { define void @s_shuffle_v4i64_v4i64__7_7_u_5() { ; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_u_5: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[16:23] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s14, s6 -; GFX900-NEXT: s_mov_b32 s15, s7 +; GFX900-NEXT: s_mov_b32 s8, s22 +; GFX900-NEXT: s_mov_b32 s9, s23 +; GFX900-NEXT: s_mov_b32 s10, s22 +; GFX900-NEXT: s_mov_b32 s11, s23 +; GFX900-NEXT: s_mov_b32 s14, s18 +; GFX900-NEXT: s_mov_b32 s15, s19 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -22723,14 +22932,17 @@ define void @s_shuffle_v4i64_v4i64__7_7_u_5() { ; ; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_u_5: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[16:23] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s14, s6 -; GFX90A-NEXT: s_mov_b32 s15, s7 +; GFX90A-NEXT: s_mov_b32 s8, s22 +; GFX90A-NEXT: s_mov_b32 s9, s23 +; GFX90A-NEXT: s_mov_b32 s10, s22 +; GFX90A-NEXT: s_mov_b32 s11, s23 +; GFX90A-NEXT: s_mov_b32 s14, s18 +; GFX90A-NEXT: s_mov_b32 s15, s19 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -22738,6 +22950,7 @@ define void @s_shuffle_v4i64_v4i64__7_7_u_5() { ; ; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_u_5: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -23932,14 +24145,15 @@ define void @s_shuffle_v4i64_v4i64__7_7_u_6() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s14, s12 -; GFX900-NEXT: s_mov_b32 s15, s13 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: s_mov_b32 s10, s18 +; GFX900-NEXT: s_mov_b32 s11, s19 +; GFX900-NEXT: s_mov_b32 s14, s16 +; GFX900-NEXT: s_mov_b32 s15, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -23949,14 +24163,15 @@ define void @s_shuffle_v4i64_v4i64__7_7_u_6() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s14, s12 -; GFX90A-NEXT: s_mov_b32 s15, s13 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: s_mov_b32 s10, s18 +; GFX90A-NEXT: s_mov_b32 s11, s19 +; GFX90A-NEXT: s_mov_b32 s14, s16 +; GFX90A-NEXT: s_mov_b32 s15, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -23964,6 +24179,7 @@ define void @s_shuffle_v4i64_v4i64__7_7_u_6() { ; ; GFX942-LABEL: s_shuffle_v4i64_v4i64__7_7_u_6: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v2p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v2p0.ll index b30af835a7882..0ef978a889880 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v2p0.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v2p0.ll @@ -57,40 +57,44 @@ define void @v_shuffle_v4p0_v2p0__0_u_u_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4p0_v2p0__1_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v2p0__1_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v2p0__1_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v2p0__1_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -113,40 +117,44 @@ define void @v_shuffle_v4p0_v2p0__2_u_u_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4p0_v2p0__3_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v2p0__3_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v2p0__3_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -433,18 +441,19 @@ define void @v_shuffle_v4p0_v2p0__3_3_1_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr8_vgpr9_vgpr10_vgpr11 +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 ; GFX900-NEXT: v_mov_b32_e32 v4, v6 ; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -452,18 +461,19 @@ define void @v_shuffle_v4p0_v2p0__3_3_1_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr8_vgpr9_vgpr10_vgpr11 +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 ; GFX90A-NEXT: v_mov_b32_e32 v4, v6 ; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -471,18 +481,19 @@ define void @v_shuffle_v4p0_v2p0__3_3_1_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:3] +; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr8_vgpr9_vgpr10_vgpr11 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:7] +; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 ; GFX942-NEXT: v_mov_b32_e32 v4, v6 +; GFX942-NEXT: v_mov_b32_e32 v8, v2 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 ; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -551,11 +562,14 @@ define void @v_shuffle_v4p0_v2p0__3_3_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 ; GFX900-NEXT: v_mov_b32_e32 v0, v2 ; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -565,11 +579,14 @@ define void @v_shuffle_v4p0_v2p0__3_3_3_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -579,11 +596,14 @@ define void @v_shuffle_v4p0_v2p0__3_3_3_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -844,11 +864,14 @@ define void @v_shuffle_v4p0_v2p0__u_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -858,11 +881,14 @@ define void @v_shuffle_v4p0_v2p0__u_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -872,11 +898,14 @@ define void @v_shuffle_v4p0_v2p0__u_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -995,11 +1024,14 @@ define void @v_shuffle_v4p0_v2p0__2_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1009,11 +1041,14 @@ define void @v_shuffle_v4p0_v2p0__2_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1023,11 +1058,14 @@ define void @v_shuffle_v4p0_v2p0__2_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -1124,6 +1162,7 @@ define void @v_shuffle_v4p0_v2p0__3_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 ; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v4 ; GFX900-NEXT: v_mov_b32_e32 v1, v5 @@ -1144,6 +1183,7 @@ define void @v_shuffle_v4p0_v2p0__3_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: v_mov_b32_e32 v1, v5 @@ -1165,6 +1205,7 @@ define void @v_shuffle_v4p0_v2p0__3_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mov_b32_e32 v0, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 @@ -1399,14 +1440,14 @@ define void @v_shuffle_v4p0_v2p0__3_3_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 ; GFX900-NEXT: v_mov_b32_e32 v2, v4 ; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1419,14 +1460,14 @@ define void @v_shuffle_v4p0_v2p0__3_3_u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1436,18 +1477,17 @@ define void @v_shuffle_v4p0_v2p0__3_3_u_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 ; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -1871,10 +1911,11 @@ define void @v_shuffle_v4p0_v2p0__3_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v2 ; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 ; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v6 ; GFX900-NEXT: v_mov_b32_e32 v1, v7 @@ -1891,10 +1932,11 @@ define void @v_shuffle_v4p0_v2p0__3_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 @@ -1911,10 +1953,11 @@ define void @v_shuffle_v4p0_v2p0__3_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: v_mov_b32_e32 v1, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v7 @@ -2374,40 +2417,44 @@ define void @v_shuffle_v4p0_v2p0__0_2_2_2(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4p0_v2p0__1_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v2p0__1_2_2_2: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v2p0__1_2_2_2: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v2p0__1_2_2_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -2487,54 +2534,61 @@ define void @v_shuffle_v4p0_v2p0__3_u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v2p0__3_u_2_2: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: v_mov_b32_e32 v4, v0 ; GFX900-NEXT: v_mov_b32_e32 v5, v1 ; GFX900-NEXT: v_mov_b32_e32 v6, v0 ; GFX900-NEXT: v_mov_b32_e32 v7, v1 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 ; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_u_2_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v5, v1 ; GFX90A-NEXT: v_mov_b32_e32 v6, v0 ; GFX90A-NEXT: v_mov_b32_e32 v7, v1 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 ; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v2p0__3_u_2_2: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mov_b32_e32 v4, v0 ; GFX942-NEXT: v_mov_b32_e32 v5, v1 ; GFX942-NEXT: v_mov_b32_e32 v6, v0 ; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 ; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -2760,13 +2814,14 @@ define void @v_shuffle_v4p0_v2p0__3_3_u_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 ; GFX900-NEXT: v_mov_b32_e32 v0, v2 ; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2776,13 +2831,14 @@ define void @v_shuffle_v4p0_v2p0__3_3_u_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2792,13 +2848,14 @@ define void @v_shuffle_v4p0_v2p0__3_3_u_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -3177,11 +3234,14 @@ define void @v_shuffle_v4p0_v2p0__3_u_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v2 ; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3191,11 +3251,14 @@ define void @v_shuffle_v4p0_v2p0__3_u_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3205,11 +3268,14 @@ define void @v_shuffle_v4p0_v2p0__3_u_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr> asm "; def $0", "=v"() @@ -3704,6 +3770,7 @@ define void @s_shuffle_v4p0_v2p0__0_u_u_u() { define void @s_shuffle_v4p0_v2p0__1_u_u_u() { ; GFX900-LABEL: s_shuffle_v4p0_v2p0__1_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -3717,6 +3784,7 @@ define void @s_shuffle_v4p0_v2p0__1_u_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4p0_v2p0__1_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -3730,6 +3798,7 @@ define void @s_shuffle_v4p0_v2p0__1_u_u_u() { ; ; GFX942-LABEL: s_shuffle_v4p0_v2p0__1_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -3763,6 +3832,7 @@ define void @s_shuffle_v4p0_v2p0__2_u_u_u() { define void @s_shuffle_v4p0_v2p0__3_u_u_u() { ; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -3776,6 +3846,7 @@ define void @s_shuffle_v4p0_v2p0__3_u_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -3789,6 +3860,7 @@ define void @s_shuffle_v4p0_v2p0__3_u_u_u() { ; ; GFX942-LABEL: s_shuffle_v4p0_v2p0__3_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -3809,15 +3881,16 @@ define void @s_shuffle_v4p0_v2p0__3_u_u_u() { define void @s_shuffle_v4p0_v2p0__3_0_u_u() { ; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_0_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ; def s[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ; def s[16:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 ; GFX900-NEXT: s_mov_b32 s10, s4 ; GFX900-NEXT: s_mov_b32 s11, s5 ; GFX900-NEXT: ;;#ASMSTART @@ -3827,15 +3900,16 @@ define void @s_shuffle_v4p0_v2p0__3_0_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_0_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ; def s[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ; def s[16:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 ; GFX90A-NEXT: s_mov_b32 s10, s4 ; GFX90A-NEXT: s_mov_b32 s11, s5 ; GFX90A-NEXT: ;;#ASMSTART @@ -3845,6 +3919,7 @@ define void @s_shuffle_v4p0_v2p0__3_0_u_u() { ; ; GFX942-LABEL: s_shuffle_v4p0_v2p0__3_0_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -3925,6 +4000,7 @@ define void @s_shuffle_v4p0_v2p0__3_1_u_u() { define void @s_shuffle_v4p0_v2p0__3_2_u_u() { ; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_2_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -3940,6 +4016,7 @@ define void @s_shuffle_v4p0_v2p0__3_2_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_2_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -3955,6 +4032,7 @@ define void @s_shuffle_v4p0_v2p0__3_2_u_u() { ; ; GFX942-LABEL: s_shuffle_v4p0_v2p0__3_2_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -5415,6 +5493,7 @@ define void @s_shuffle_v4p0_v2p0__0_2_2_2() { define void @s_shuffle_v4p0_v2p0__1_2_2_2() { ; GFX900-LABEL: s_shuffle_v4p0_v2p0__1_2_2_2: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -5428,6 +5507,7 @@ define void @s_shuffle_v4p0_v2p0__1_2_2_2() { ; ; GFX90A-LABEL: s_shuffle_v4p0_v2p0__1_2_2_2: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -5441,6 +5521,7 @@ define void @s_shuffle_v4p0_v2p0__1_2_2_2() { ; ; GFX942-LABEL: s_shuffle_v4p0_v2p0__1_2_2_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -5636,6 +5717,7 @@ define void @s_shuffle_v4p0_v2p0__3_3_2_2() { define void @s_shuffle_v4p0_v2p0__3_3_u_2() { ; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_3_u_2: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -5653,6 +5735,7 @@ define void @s_shuffle_v4p0_v2p0__3_3_u_2() { ; ; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_3_u_2: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -5670,6 +5753,7 @@ define void @s_shuffle_v4p0_v2p0__3_3_u_2() { ; ; GFX942-LABEL: s_shuffle_v4p0_v2p0__3_3_u_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v3p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v3p0.ll index e6ac554735eee..dfdb2fbbe19e1 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v3p0.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v3p0.ll @@ -18,37 +18,47 @@ define void @v_shuffle_v4p0_v3p0__u_u_u_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4p0_v3p0__0_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v3p0__0_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v3p0__0_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v3p0__0_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v6, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -103,6 +113,7 @@ define void @v_shuffle_v4p0_v3p0__2_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v4 ; GFX900-NEXT: v_mov_b32_e32 v1, v5 @@ -116,6 +127,7 @@ define void @v_shuffle_v4p0_v3p0__2_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: v_mov_b32_e32 v1, v5 @@ -129,6 +141,7 @@ define void @v_shuffle_v4p0_v3p0__2_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 @@ -199,6 +212,7 @@ define void @v_shuffle_v4p0_v3p0__5_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v4 ; GFX900-NEXT: v_mov_b32_e32 v1, v5 @@ -212,6 +226,7 @@ define void @v_shuffle_v4p0_v3p0__5_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: v_mov_b32_e32 v1, v5 @@ -225,6 +240,7 @@ define void @v_shuffle_v4p0_v3p0__5_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 @@ -682,6 +698,7 @@ define void @v_shuffle_v4p0_v3p0__5_5_2_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v4 ; GFX900-NEXT: v_mov_b32_e32 v1, v5 @@ -701,6 +718,7 @@ define void @v_shuffle_v4p0_v3p0__5_5_2_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: v_mov_b32_e32 v1, v5 @@ -720,6 +738,7 @@ define void @v_shuffle_v4p0_v3p0__5_5_2_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 @@ -847,6 +866,7 @@ define void @v_shuffle_v4p0_v3p0__5_5_5_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v4 ; GFX900-NEXT: v_mov_b32_e32 v1, v5 @@ -864,6 +884,7 @@ define void @v_shuffle_v4p0_v3p0__5_5_5_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: v_mov_b32_e32 v1, v5 @@ -881,6 +902,7 @@ define void @v_shuffle_v4p0_v3p0__5_5_5_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 @@ -1280,7 +1302,10 @@ define void @v_shuffle_v4p0_v3p0__u_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 ; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1294,7 +1319,10 @@ define void @v_shuffle_v4p0_v3p0__u_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1308,7 +1336,10 @@ define void @v_shuffle_v4p0_v3p0__u_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -1493,7 +1524,10 @@ define void @v_shuffle_v4p0_v3p0__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 ; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1507,7 +1541,10 @@ define void @v_shuffle_v4p0_v3p0__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1521,7 +1558,10 @@ define void @v_shuffle_v4p0_v3p0__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -1680,6 +1720,7 @@ define void @v_shuffle_v4p0_v3p0__5_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 ; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v6 ; GFX900-NEXT: v_mov_b32_e32 v1, v7 @@ -1700,6 +1741,7 @@ define void @v_shuffle_v4p0_v3p0__5_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 @@ -1721,6 +1763,7 @@ define void @v_shuffle_v4p0_v3p0__5_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v7 @@ -2088,12 +2131,14 @@ define void @v_shuffle_v4p0_v3p0__5_5_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: v_mov_b32_e32 v4, v6 ; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 ; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -2107,12 +2152,14 @@ define void @v_shuffle_v4p0_v3p0__5_5_u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: v_mov_b32_e32 v4, v6 ; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -2127,12 +2174,14 @@ define void @v_shuffle_v4p0_v3p0__5_5_u_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mov_b32_e32 v4, v6 ; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 ; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -2812,9 +2861,10 @@ define void @v_shuffle_v4p0_v3p0__5_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: v_mov_b32_e32 v4, v2 ; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v0, v8 ; GFX900-NEXT: v_mov_b32_e32 v1, v9 -; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 ; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -2831,9 +2881,10 @@ define void @v_shuffle_v4p0_v3p0__5_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v0, v8 ; GFX90A-NEXT: v_mov_b32_e32 v1, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -2851,9 +2902,10 @@ define void @v_shuffle_v4p0_v3p0__5_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mov_b32_e32 v4, v2 ; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v0, v8 ; GFX942-NEXT: v_mov_b32_e32 v1, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 ; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -3907,14 +3959,15 @@ define void @v_shuffle_v4p0_v3p0__5_u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[6:11] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 ; GFX900-NEXT: v_mov_b32_e32 v0, v10 ; GFX900-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 ; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -3926,14 +3979,15 @@ define void @v_shuffle_v4p0_v3p0__5_u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 ; GFX90A-NEXT: v_mov_b32_e32 v0, v10 ; GFX90A-NEXT: v_mov_b32_e32 v1, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -3945,14 +3999,16 @@ define void @v_shuffle_v4p0_v3p0__5_u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[6:11] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v10 ; GFX942-NEXT: v_mov_b32_e32 v1, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 ; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -4630,37 +4686,47 @@ define void @v_shuffle_v4p0_v3p0__u_3_3_3(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4p0_v3p0__0_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v3p0__0_3_3_3: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p0_v3p0__0_3_3_3: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p0_v3p0__0_3_3_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: global_store_dwordx4 v6, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v6, v4 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -4715,6 +4781,7 @@ define void @v_shuffle_v4p0_v3p0__2_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v4 ; GFX900-NEXT: v_mov_b32_e32 v1, v5 @@ -4728,6 +4795,7 @@ define void @v_shuffle_v4p0_v3p0__2_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: v_mov_b32_e32 v1, v5 @@ -4741,6 +4809,7 @@ define void @v_shuffle_v4p0_v3p0__2_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 @@ -4894,6 +4963,7 @@ define void @v_shuffle_v4p0_v3p0__5_u_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 ; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v4 ; GFX900-NEXT: v_mov_b32_e32 v1, v5 @@ -4911,6 +4981,7 @@ define void @v_shuffle_v4p0_v3p0__5_u_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: v_mov_b32_e32 v1, v5 @@ -4928,6 +4999,7 @@ define void @v_shuffle_v4p0_v3p0__5_u_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mov_b32_e32 v0, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 @@ -5270,14 +5342,14 @@ define void @v_shuffle_v4p0_v3p0__5_5_u_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 ; GFX900-NEXT: v_mov_b32_e32 v2, v4 ; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5287,14 +5359,14 @@ define void @v_shuffle_v4p0_v3p0__5_5_u_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5304,14 +5376,14 @@ define void @v_shuffle_v4p0_v3p0__5_5_u_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 ; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -5978,6 +6050,7 @@ define void @v_shuffle_v4p0_v3p0__5_u_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: v_mov_b32_e32 v0, v2 ; GFX900-NEXT: v_mov_b32_e32 v1, v3 ; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v4 ; GFX900-NEXT: v_mov_b32_e32 v1, v5 @@ -5995,6 +6068,7 @@ define void @v_shuffle_v4p0_v3p0__5_u_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3 ; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: v_mov_b32_e32 v1, v5 @@ -6012,6 +6086,7 @@ define void @v_shuffle_v4p0_v3p0__5_u_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: v_mov_b32_e32 v1, v3 ; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mov_b32_e32 v0, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 @@ -7016,9 +7091,10 @@ define void @v_shuffle_v4p0_v3p0__5_u_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v4 ; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v0, v4 ; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 ; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -7032,9 +7108,10 @@ define void @v_shuffle_v4p0_v3p0__5_u_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -7048,9 +7125,10 @@ define void @v_shuffle_v4p0_v3p0__5_u_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v0, v4 ; GFX942-NEXT: v_mov_b32_e32 v1, v5 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 ; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -7792,6 +7870,7 @@ define void @s_shuffle_v4p0_v3p0__1_u_u_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: s_mov_b32 s8, s6 ; GFX900-NEXT: s_mov_b32 s9, s7 ; GFX900-NEXT: ;;#ASMSTART @@ -7805,6 +7884,7 @@ define void @s_shuffle_v4p0_v3p0__1_u_u_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: s_mov_b32 s8, s6 ; GFX90A-NEXT: s_mov_b32 s9, s7 ; GFX90A-NEXT: ;;#ASMSTART @@ -7814,6 +7894,7 @@ define void @s_shuffle_v4p0_v3p0__1_u_u_u() { ; ; GFX942-LABEL: s_shuffle_v4p0_v3p0__1_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -7835,8 +7916,11 @@ define void @s_shuffle_v4p0_v3p0__2_u_u_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX900-NEXT: s_mov_b32 s8, s16 +; GFX900-NEXT: s_mov_b32 s9, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -7846,8 +7930,11 @@ define void @s_shuffle_v4p0_v3p0__2_u_u_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX90A-NEXT: s_mov_b32 s8, s16 +; GFX90A-NEXT: s_mov_b32 s9, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -7855,6 +7942,7 @@ define void @s_shuffle_v4p0_v3p0__2_u_u_u() { ; ; GFX942-LABEL: s_shuffle_v4p0_v3p0__2_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -7892,6 +7980,7 @@ define void @s_shuffle_v4p0_v3p0__4_u_u_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: s_mov_b32 s8, s6 ; GFX900-NEXT: s_mov_b32 s9, s7 ; GFX900-NEXT: ;;#ASMSTART @@ -7905,6 +7994,7 @@ define void @s_shuffle_v4p0_v3p0__4_u_u_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: s_mov_b32 s8, s6 ; GFX90A-NEXT: s_mov_b32 s9, s7 ; GFX90A-NEXT: ;;#ASMSTART @@ -7914,6 +8004,7 @@ define void @s_shuffle_v4p0_v3p0__4_u_u_u() { ; ; GFX942-LABEL: s_shuffle_v4p0_v3p0__4_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -7936,8 +8027,11 @@ define void @s_shuffle_v4p0_v3p0__5_u_u_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX900-NEXT: s_mov_b32 s8, s16 +; GFX900-NEXT: s_mov_b32 s9, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -7947,8 +8041,11 @@ define void @s_shuffle_v4p0_v3p0__5_u_u_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX90A-NEXT: s_mov_b32 s8, s16 +; GFX90A-NEXT: s_mov_b32 s9, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -7956,6 +8053,7 @@ define void @s_shuffle_v4p0_v3p0__5_u_u_u() { ; ; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -7981,10 +8079,11 @@ define void @s_shuffle_v4p0_v3p0__5_0_u_u() { ; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX900-NEXT: s_mov_b32 s8, s16 +; GFX900-NEXT: s_mov_b32 s9, s17 ; GFX900-NEXT: s_mov_b32 s10, s4 ; GFX900-NEXT: s_mov_b32 s11, s5 ; GFX900-NEXT: ;;#ASMSTART @@ -7999,10 +8098,11 @@ define void @s_shuffle_v4p0_v3p0__5_0_u_u() { ; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX90A-NEXT: s_mov_b32 s8, s16 +; GFX90A-NEXT: s_mov_b32 s9, s17 ; GFX90A-NEXT: s_mov_b32 s10, s4 ; GFX90A-NEXT: s_mov_b32 s11, s5 ; GFX90A-NEXT: ;;#ASMSTART @@ -8014,12 +8114,15 @@ define void @s_shuffle_v4p0_v3p0__5_0_u_u() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[12:17] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s10, s0 +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:9] +; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s16 +; GFX942-NEXT: s_mov_b32 s9, s17 +; GFX942-NEXT: s_mov_b32 s10, s0 ; GFX942-NEXT: s_mov_b32 s11, s1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] @@ -8088,13 +8191,16 @@ define void @s_shuffle_v4p0_v3p0__5_2_u_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[20:25] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s8, s24 +; GFX900-NEXT: s_mov_b32 s9, s25 +; GFX900-NEXT: s_mov_b32 s10, s16 +; GFX900-NEXT: s_mov_b32 s11, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -8104,13 +8210,16 @@ define void @s_shuffle_v4p0_v3p0__5_2_u_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[20:25] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s8, s24 +; GFX90A-NEXT: s_mov_b32 s9, s25 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s11, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -8120,13 +8229,14 @@ define void @s_shuffle_v4p0_v3p0__5_2_u_u() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] +; GFX942-NEXT: ; def s[12:17] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 +; GFX942-NEXT: s_mov_b32 s8, s16 +; GFX942-NEXT: s_mov_b32 s9, s17 ; GFX942-NEXT: s_mov_b32 s10, s4 ; GFX942-NEXT: s_mov_b32 s11, s5 ; GFX942-NEXT: ;;#ASMSTART @@ -8143,12 +8253,15 @@ define void @s_shuffle_v4p0_v3p0__5_2_u_u() { define void @s_shuffle_v4p0_v3p0__5_3_u_u() { ; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_3_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[16:21] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s8, s20 +; GFX900-NEXT: s_mov_b32 s9, s21 +; GFX900-NEXT: s_mov_b32 s10, s16 +; GFX900-NEXT: s_mov_b32 s11, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -8156,12 +8269,15 @@ define void @s_shuffle_v4p0_v3p0__5_3_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_3_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[16:21] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s8, s20 +; GFX90A-NEXT: s_mov_b32 s9, s21 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s11, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -8169,6 +8285,7 @@ define void @s_shuffle_v4p0_v3p0__5_3_u_u() { ; ; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_3_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -8213,12 +8330,13 @@ define void @s_shuffle_v4p0_v3p0__5_5_u_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX900-NEXT: s_mov_b32 s8, s16 +; GFX900-NEXT: s_mov_b32 s9, s17 +; GFX900-NEXT: s_mov_b32 s10, s16 +; GFX900-NEXT: s_mov_b32 s11, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -8228,12 +8346,13 @@ define void @s_shuffle_v4p0_v3p0__5_5_u_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX90A-NEXT: s_mov_b32 s8, s16 +; GFX90A-NEXT: s_mov_b32 s9, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s11, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -8241,6 +8360,7 @@ define void @s_shuffle_v4p0_v3p0__5_5_u_u() { ; ; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_5_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -8268,12 +8388,13 @@ define void @s_shuffle_v4p0_v3p0__5_5_0_u() { ; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX900-NEXT: s_mov_b32 s8, s16 +; GFX900-NEXT: s_mov_b32 s9, s17 +; GFX900-NEXT: s_mov_b32 s10, s16 +; GFX900-NEXT: s_mov_b32 s11, s17 ; GFX900-NEXT: s_mov_b32 s12, s4 ; GFX900-NEXT: s_mov_b32 s13, s5 ; GFX900-NEXT: ;;#ASMSTART @@ -8288,12 +8409,13 @@ define void @s_shuffle_v4p0_v3p0__5_5_0_u() { ; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX90A-NEXT: s_mov_b32 s8, s16 +; GFX90A-NEXT: s_mov_b32 s9, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s11, s17 ; GFX90A-NEXT: s_mov_b32 s12, s4 ; GFX90A-NEXT: s_mov_b32 s13, s5 ; GFX90A-NEXT: ;;#ASMSTART @@ -8305,15 +8427,16 @@ define void @s_shuffle_v4p0_v3p0__5_5_0_u() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] +; GFX942-NEXT: ; def s[12:17] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: s_mov_b32 s8, s16 +; GFX942-NEXT: s_mov_b32 s9, s17 +; GFX942-NEXT: s_mov_b32 s10, s16 +; GFX942-NEXT: s_mov_b32 s11, s17 ; GFX942-NEXT: s_mov_b32 s12, s0 ; GFX942-NEXT: s_mov_b32 s13, s1 ; GFX942-NEXT: ;;#ASMSTART @@ -8335,12 +8458,13 @@ define void @s_shuffle_v4p0_v3p0__5_5_1_u() { ; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX900-NEXT: s_mov_b32 s8, s16 +; GFX900-NEXT: s_mov_b32 s9, s17 +; GFX900-NEXT: s_mov_b32 s10, s16 +; GFX900-NEXT: s_mov_b32 s11, s17 ; GFX900-NEXT: s_mov_b32 s12, s6 ; GFX900-NEXT: s_mov_b32 s13, s7 ; GFX900-NEXT: ;;#ASMSTART @@ -8355,12 +8479,13 @@ define void @s_shuffle_v4p0_v3p0__5_5_1_u() { ; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX90A-NEXT: s_mov_b32 s8, s16 +; GFX90A-NEXT: s_mov_b32 s9, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s11, s17 ; GFX90A-NEXT: s_mov_b32 s12, s6 ; GFX90A-NEXT: s_mov_b32 s13, s7 ; GFX90A-NEXT: ;;#ASMSTART @@ -8372,15 +8497,16 @@ define void @s_shuffle_v4p0_v3p0__5_5_1_u() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] +; GFX942-NEXT: ; def s[12:17] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: s_mov_b32 s8, s16 +; GFX942-NEXT: s_mov_b32 s9, s17 +; GFX942-NEXT: s_mov_b32 s10, s16 +; GFX942-NEXT: s_mov_b32 s11, s17 ; GFX942-NEXT: s_mov_b32 s12, s2 ; GFX942-NEXT: s_mov_b32 s13, s3 ; GFX942-NEXT: ;;#ASMSTART @@ -8458,14 +8584,17 @@ define void @s_shuffle_v4p0_v3p0__5_5_2_u() { define void @s_shuffle_v4p0_v3p0__5_5_3_u() { ; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_3_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ; def s[16:21] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s10, s16 -; GFX900-NEXT: s_mov_b32 s11, s17 +; GFX900-NEXT: s_mov_b32 s8, s20 +; GFX900-NEXT: s_mov_b32 s9, s21 +; GFX900-NEXT: s_mov_b32 s10, s20 +; GFX900-NEXT: s_mov_b32 s11, s21 +; GFX900-NEXT: s_mov_b32 s12, s16 +; GFX900-NEXT: s_mov_b32 s13, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -8473,14 +8602,17 @@ define void @s_shuffle_v4p0_v3p0__5_5_3_u() { ; ; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_3_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ; def s[16:21] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s8, s20 +; GFX90A-NEXT: s_mov_b32 s9, s21 +; GFX90A-NEXT: s_mov_b32 s10, s20 +; GFX90A-NEXT: s_mov_b32 s11, s21 +; GFX90A-NEXT: s_mov_b32 s12, s16 +; GFX90A-NEXT: s_mov_b32 s13, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -8488,6 +8620,7 @@ define void @s_shuffle_v4p0_v3p0__5_5_3_u() { ; ; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_5_3_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -8512,16 +8645,17 @@ define void @s_shuffle_v4p0_v3p0__5_5_3_u() { define void @s_shuffle_v4p0_v3p0__5_5_4_u() { ; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_4_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ; def s[16:21] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s10, s16 -; GFX900-NEXT: s_mov_b32 s11, s17 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s8, s20 +; GFX900-NEXT: s_mov_b32 s9, s21 +; GFX900-NEXT: s_mov_b32 s10, s20 +; GFX900-NEXT: s_mov_b32 s11, s21 +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -8529,16 +8663,17 @@ define void @s_shuffle_v4p0_v3p0__5_5_4_u() { ; ; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_4_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ; def s[16:21] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s8, s20 +; GFX90A-NEXT: s_mov_b32 s9, s21 +; GFX90A-NEXT: s_mov_b32 s10, s20 +; GFX90A-NEXT: s_mov_b32 s11, s21 +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -8546,6 +8681,7 @@ define void @s_shuffle_v4p0_v3p0__5_5_4_u() { ; ; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_5_4_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -8945,6 +9081,7 @@ define void @s_shuffle_v4p0_v3p0__u_0_0_0() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: s_mov_b32 s10, s4 ; GFX900-NEXT: s_mov_b32 s11, s5 ; GFX900-NEXT: s_mov_b32 s12, s4 @@ -8962,6 +9099,7 @@ define void @s_shuffle_v4p0_v3p0__u_0_0_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: s_mov_b32 s10, s4 ; GFX90A-NEXT: s_mov_b32 s11, s5 ; GFX90A-NEXT: s_mov_b32 s12, s4 @@ -8975,6 +9113,7 @@ define void @s_shuffle_v4p0_v3p0__u_0_0_0() { ; ; GFX942-LABEL: s_shuffle_v4p0_v3p0__u_0_0_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -9147,6 +9286,7 @@ define void @s_shuffle_v4p0_v3p0__3_0_0_0() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: s_mov_b32 s10, s4 ; GFX900-NEXT: s_mov_b32 s11, s5 ; GFX900-NEXT: s_mov_b32 s12, s4 @@ -9164,6 +9304,7 @@ define void @s_shuffle_v4p0_v3p0__3_0_0_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: s_mov_b32 s10, s4 ; GFX90A-NEXT: s_mov_b32 s11, s5 ; GFX90A-NEXT: s_mov_b32 s12, s4 @@ -9177,6 +9318,7 @@ define void @s_shuffle_v4p0_v3p0__3_0_0_0() { ; ; GFX942-LABEL: s_shuffle_v4p0_v3p0__3_0_0_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -9349,10 +9491,11 @@ define void @s_shuffle_v4p0_v3p0__5_u_0_0() { ; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX900-NEXT: s_mov_b32 s8, s16 +; GFX900-NEXT: s_mov_b32 s9, s17 ; GFX900-NEXT: s_mov_b32 s12, s4 ; GFX900-NEXT: s_mov_b32 s13, s5 ; GFX900-NEXT: s_mov_b32 s14, s4 @@ -9369,10 +9512,11 @@ define void @s_shuffle_v4p0_v3p0__5_u_0_0() { ; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX90A-NEXT: s_mov_b32 s8, s16 +; GFX90A-NEXT: s_mov_b32 s9, s17 ; GFX90A-NEXT: s_mov_b32 s12, s4 ; GFX90A-NEXT: s_mov_b32 s13, s5 ; GFX90A-NEXT: s_mov_b32 s14, s4 @@ -9386,12 +9530,15 @@ define void @s_shuffle_v4p0_v3p0__5_u_0_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[12:17] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s0 +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:9] +; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s16 +; GFX942-NEXT: s_mov_b32 s9, s17 +; GFX942-NEXT: s_mov_b32 s12, s0 ; GFX942-NEXT: s_mov_b32 s13, s1 ; GFX942-NEXT: s_mov_b32 s14, s0 ; GFX942-NEXT: s_mov_b32 s15, s1 @@ -9765,12 +9912,13 @@ define void @s_shuffle_v4p0_v3p0__5_5_u_0() { ; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX900-NEXT: s_mov_b32 s8, s16 +; GFX900-NEXT: s_mov_b32 s9, s17 +; GFX900-NEXT: s_mov_b32 s10, s16 +; GFX900-NEXT: s_mov_b32 s11, s17 ; GFX900-NEXT: s_mov_b32 s14, s4 ; GFX900-NEXT: s_mov_b32 s15, s5 ; GFX900-NEXT: ;;#ASMSTART @@ -9785,12 +9933,13 @@ define void @s_shuffle_v4p0_v3p0__5_5_u_0() { ; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX90A-NEXT: s_mov_b32 s8, s16 +; GFX90A-NEXT: s_mov_b32 s9, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s11, s17 ; GFX90A-NEXT: s_mov_b32 s14, s4 ; GFX90A-NEXT: s_mov_b32 s15, s5 ; GFX90A-NEXT: ;;#ASMSTART @@ -9802,15 +9951,16 @@ define void @s_shuffle_v4p0_v3p0__5_5_u_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] +; GFX942-NEXT: ; def s[12:17] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: s_mov_b32 s8, s16 +; GFX942-NEXT: s_mov_b32 s9, s17 +; GFX942-NEXT: s_mov_b32 s10, s16 +; GFX942-NEXT: s_mov_b32 s11, s17 ; GFX942-NEXT: s_mov_b32 s14, s0 ; GFX942-NEXT: s_mov_b32 s15, s1 ; GFX942-NEXT: ;;#ASMSTART @@ -10357,10 +10507,11 @@ define void @s_shuffle_v4p0_v3p0__5_u_1_1() { ; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX900-NEXT: s_mov_b32 s8, s16 +; GFX900-NEXT: s_mov_b32 s9, s17 ; GFX900-NEXT: s_mov_b32 s12, s6 ; GFX900-NEXT: s_mov_b32 s13, s7 ; GFX900-NEXT: s_mov_b32 s14, s6 @@ -10377,10 +10528,11 @@ define void @s_shuffle_v4p0_v3p0__5_u_1_1() { ; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX90A-NEXT: s_mov_b32 s8, s16 +; GFX90A-NEXT: s_mov_b32 s9, s17 ; GFX90A-NEXT: s_mov_b32 s12, s6 ; GFX90A-NEXT: s_mov_b32 s13, s7 ; GFX90A-NEXT: s_mov_b32 s14, s6 @@ -10394,12 +10546,15 @@ define void @s_shuffle_v4p0_v3p0__5_u_1_1() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:5] +; GFX942-NEXT: ; def s[12:17] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s2 +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:9] +; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s16 +; GFX942-NEXT: s_mov_b32 s9, s17 +; GFX942-NEXT: s_mov_b32 s12, s2 ; GFX942-NEXT: s_mov_b32 s13, s3 ; GFX942-NEXT: s_mov_b32 s14, s2 ; GFX942-NEXT: s_mov_b32 s15, s3 @@ -10769,12 +10924,13 @@ define void @s_shuffle_v4p0_v3p0__5_5_u_1() { ; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX900-NEXT: s_mov_b32 s8, s16 +; GFX900-NEXT: s_mov_b32 s9, s17 +; GFX900-NEXT: s_mov_b32 s10, s16 +; GFX900-NEXT: s_mov_b32 s11, s17 ; GFX900-NEXT: s_mov_b32 s14, s6 ; GFX900-NEXT: s_mov_b32 s15, s7 ; GFX900-NEXT: ;;#ASMSTART @@ -10789,12 +10945,13 @@ define void @s_shuffle_v4p0_v3p0__5_5_u_1() { ; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX90A-NEXT: s_mov_b32 s8, s16 +; GFX90A-NEXT: s_mov_b32 s9, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s11, s17 ; GFX90A-NEXT: s_mov_b32 s14, s6 ; GFX90A-NEXT: s_mov_b32 s15, s7 ; GFX90A-NEXT: ;;#ASMSTART @@ -10806,15 +10963,16 @@ define void @s_shuffle_v4p0_v3p0__5_5_u_1() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] +; GFX942-NEXT: ; def s[12:17] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: s_mov_b32 s8, s16 +; GFX942-NEXT: s_mov_b32 s9, s17 +; GFX942-NEXT: s_mov_b32 s10, s16 +; GFX942-NEXT: s_mov_b32 s11, s17 ; GFX942-NEXT: s_mov_b32 s14, s2 ; GFX942-NEXT: s_mov_b32 s15, s3 ; GFX942-NEXT: ;;#ASMSTART @@ -11734,17 +11892,18 @@ define void @s_shuffle_v4p0_v3p0__5_5_u_2() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[16:21] +; GFX900-NEXT: ; def s[20:25] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s20 -; GFX900-NEXT: s_mov_b32 s9, s21 -; GFX900-NEXT: s_mov_b32 s10, s20 -; GFX900-NEXT: s_mov_b32 s11, s21 -; GFX900-NEXT: s_mov_b32 s14, s12 -; GFX900-NEXT: s_mov_b32 s15, s13 +; GFX900-NEXT: s_mov_b32 s8, s24 +; GFX900-NEXT: s_mov_b32 s9, s25 +; GFX900-NEXT: s_mov_b32 s10, s24 +; GFX900-NEXT: s_mov_b32 s11, s25 +; GFX900-NEXT: s_mov_b32 s14, s16 +; GFX900-NEXT: s_mov_b32 s15, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -11754,17 +11913,18 @@ define void @s_shuffle_v4p0_v3p0__5_5_u_2() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[16:21] +; GFX90A-NEXT: ; def s[20:25] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s20 -; GFX90A-NEXT: s_mov_b32 s9, s21 -; GFX90A-NEXT: s_mov_b32 s10, s20 -; GFX90A-NEXT: s_mov_b32 s11, s21 -; GFX90A-NEXT: s_mov_b32 s14, s12 -; GFX90A-NEXT: s_mov_b32 s15, s13 +; GFX90A-NEXT: s_mov_b32 s8, s24 +; GFX90A-NEXT: s_mov_b32 s9, s25 +; GFX90A-NEXT: s_mov_b32 s10, s24 +; GFX90A-NEXT: s_mov_b32 s11, s25 +; GFX90A-NEXT: s_mov_b32 s14, s16 +; GFX90A-NEXT: s_mov_b32 s15, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -11774,15 +11934,16 @@ define void @s_shuffle_v4p0_v3p0__5_5_u_2() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:13] +; GFX942-NEXT: ; def s[12:17] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s12 -; GFX942-NEXT: s_mov_b32 s9, s13 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: s_mov_b32 s8, s16 +; GFX942-NEXT: s_mov_b32 s9, s17 +; GFX942-NEXT: s_mov_b32 s10, s16 +; GFX942-NEXT: s_mov_b32 s11, s17 ; GFX942-NEXT: s_mov_b32 s14, s4 ; GFX942-NEXT: s_mov_b32 s15, s5 ; GFX942-NEXT: ;;#ASMSTART @@ -12143,6 +12304,7 @@ define void @s_shuffle_v4p0_v3p0__1_3_3_3() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:9] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: s_mov_b32 s8, s6 ; GFX900-NEXT: s_mov_b32 s9, s7 ; GFX900-NEXT: ;;#ASMSTART @@ -12156,6 +12318,7 @@ define void @s_shuffle_v4p0_v3p0__1_3_3_3() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:9] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: s_mov_b32 s8, s6 ; GFX90A-NEXT: s_mov_b32 s9, s7 ; GFX90A-NEXT: ;;#ASMSTART @@ -12165,6 +12328,7 @@ define void @s_shuffle_v4p0_v3p0__1_3_3_3() { ; ; GFX942-LABEL: s_shuffle_v4p0_v3p0__1_3_3_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -12186,8 +12350,11 @@ define void @s_shuffle_v4p0_v3p0__2_3_3_3() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX900-NEXT: s_mov_b32 s8, s16 +; GFX900-NEXT: s_mov_b32 s9, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -12197,8 +12364,11 @@ define void @s_shuffle_v4p0_v3p0__2_3_3_3() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX90A-NEXT: s_mov_b32 s8, s16 +; GFX90A-NEXT: s_mov_b32 s9, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -12206,6 +12376,7 @@ define void @s_shuffle_v4p0_v3p0__2_3_3_3() { ; ; GFX942-LABEL: s_shuffle_v4p0_v3p0__2_3_3_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -12363,14 +12534,17 @@ define void @s_shuffle_v4p0_v3p0__5_3_3_3() { define void @s_shuffle_v4p0_v3p0__5_u_3_3() { ; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_u_3_3: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[16:21] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: s_mov_b32 s8, s20 +; GFX900-NEXT: s_mov_b32 s9, s21 +; GFX900-NEXT: s_mov_b32 s12, s16 +; GFX900-NEXT: s_mov_b32 s13, s17 +; GFX900-NEXT: s_mov_b32 s14, s16 +; GFX900-NEXT: s_mov_b32 s15, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -12378,14 +12552,17 @@ define void @s_shuffle_v4p0_v3p0__5_u_3_3() { ; ; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_u_3_3: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[16:21] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: s_mov_b32 s8, s20 +; GFX90A-NEXT: s_mov_b32 s9, s21 +; GFX90A-NEXT: s_mov_b32 s12, s16 +; GFX90A-NEXT: s_mov_b32 s13, s17 +; GFX90A-NEXT: s_mov_b32 s14, s16 +; GFX90A-NEXT: s_mov_b32 s15, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -12393,6 +12570,7 @@ define void @s_shuffle_v4p0_v3p0__5_u_3_3() { ; ; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_u_3_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -12744,16 +12922,17 @@ define void @s_shuffle_v4p0_v3p0__5_5_3_3() { define void @s_shuffle_v4p0_v3p0__5_5_u_3() { ; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_u_3: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ; def s[16:21] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s10, s16 -; GFX900-NEXT: s_mov_b32 s11, s17 -; GFX900-NEXT: s_mov_b32 s14, s12 -; GFX900-NEXT: s_mov_b32 s15, s13 +; GFX900-NEXT: s_mov_b32 s8, s20 +; GFX900-NEXT: s_mov_b32 s9, s21 +; GFX900-NEXT: s_mov_b32 s10, s20 +; GFX900-NEXT: s_mov_b32 s11, s21 +; GFX900-NEXT: s_mov_b32 s14, s16 +; GFX900-NEXT: s_mov_b32 s15, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -12761,16 +12940,17 @@ define void @s_shuffle_v4p0_v3p0__5_5_u_3() { ; ; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_u_3: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ; def s[16:21] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s11, s17 -; GFX90A-NEXT: s_mov_b32 s14, s12 -; GFX90A-NEXT: s_mov_b32 s15, s13 +; GFX90A-NEXT: s_mov_b32 s8, s20 +; GFX90A-NEXT: s_mov_b32 s9, s21 +; GFX90A-NEXT: s_mov_b32 s10, s20 +; GFX90A-NEXT: s_mov_b32 s11, s21 +; GFX90A-NEXT: s_mov_b32 s14, s16 +; GFX90A-NEXT: s_mov_b32 s15, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -12778,6 +12958,7 @@ define void @s_shuffle_v4p0_v3p0__5_5_u_3() { ; ; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_5_u_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -13368,14 +13549,17 @@ define void @s_shuffle_v4p0_v3p0__5_4_4_4() { define void @s_shuffle_v4p0_v3p0__5_u_4_4() { ; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_u_4_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ; def s[16:21] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 -; GFX900-NEXT: s_mov_b32 s14, s6 -; GFX900-NEXT: s_mov_b32 s15, s7 +; GFX900-NEXT: s_mov_b32 s8, s20 +; GFX900-NEXT: s_mov_b32 s9, s21 +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s14, s18 +; GFX900-NEXT: s_mov_b32 s15, s19 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -13383,14 +13567,17 @@ define void @s_shuffle_v4p0_v3p0__5_u_4_4() { ; ; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_u_4_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ; def s[16:21] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 -; GFX90A-NEXT: s_mov_b32 s14, s6 -; GFX90A-NEXT: s_mov_b32 s15, s7 +; GFX90A-NEXT: s_mov_b32 s8, s20 +; GFX90A-NEXT: s_mov_b32 s9, s21 +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s14, s18 +; GFX90A-NEXT: s_mov_b32 s15, s19 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -13398,6 +13585,7 @@ define void @s_shuffle_v4p0_v3p0__5_u_4_4() { ; ; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_u_4_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -13739,14 +13927,17 @@ define void @s_shuffle_v4p0_v3p0__5_5_4_4() { define void @s_shuffle_v4p0_v3p0__5_5_u_4() { ; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_u_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ; def s[16:21] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s16 -; GFX900-NEXT: s_mov_b32 s9, s17 -; GFX900-NEXT: s_mov_b32 s10, s16 -; GFX900-NEXT: s_mov_b32 s11, s17 +; GFX900-NEXT: s_mov_b32 s8, s20 +; GFX900-NEXT: s_mov_b32 s9, s21 +; GFX900-NEXT: s_mov_b32 s10, s20 +; GFX900-NEXT: s_mov_b32 s11, s21 +; GFX900-NEXT: s_mov_b32 s14, s18 +; GFX900-NEXT: s_mov_b32 s15, s19 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -13754,14 +13945,17 @@ define void @s_shuffle_v4p0_v3p0__5_5_u_4() { ; ; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_u_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ; def s[16:21] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s16 -; GFX90A-NEXT: s_mov_b32 s9, s17 -; GFX90A-NEXT: s_mov_b32 s10, s16 -; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s8, s20 +; GFX90A-NEXT: s_mov_b32 s9, s21 +; GFX90A-NEXT: s_mov_b32 s10, s20 +; GFX90A-NEXT: s_mov_b32 s11, s21 +; GFX90A-NEXT: s_mov_b32 s14, s18 +; GFX90A-NEXT: s_mov_b32 s15, s19 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -13769,6 +13963,7 @@ define void @s_shuffle_v4p0_v3p0__5_5_u_4() { ; ; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_5_u_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] @@ -14630,14 +14825,15 @@ define void @s_shuffle_v4p0_v3p0__5_5_u_5() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ; def s[12:17] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s12 -; GFX900-NEXT: s_mov_b32 s9, s13 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 -; GFX900-NEXT: s_mov_b32 s14, s12 -; GFX900-NEXT: s_mov_b32 s15, s13 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX900-NEXT: s_mov_b32 s8, s16 +; GFX900-NEXT: s_mov_b32 s9, s17 +; GFX900-NEXT: s_mov_b32 s10, s16 +; GFX900-NEXT: s_mov_b32 s11, s17 +; GFX900-NEXT: s_mov_b32 s14, s16 +; GFX900-NEXT: s_mov_b32 s15, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -14647,14 +14843,15 @@ define void @s_shuffle_v4p0_v3p0__5_5_u_5() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ; def s[12:17] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s12 -; GFX90A-NEXT: s_mov_b32 s9, s13 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 -; GFX90A-NEXT: s_mov_b32 s14, s12 -; GFX90A-NEXT: s_mov_b32 s15, s13 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX90A-NEXT: s_mov_b32 s8, s16 +; GFX90A-NEXT: s_mov_b32 s9, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: s_mov_b32 s14, s16 +; GFX90A-NEXT: s_mov_b32 s15, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -14662,6 +14859,7 @@ define void @s_shuffle_v4p0_v3p0__5_5_u_5() { ; ; GFX942-LABEL: s_shuffle_v4p0_v3p0__5_5_u_5: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:5] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v4p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v4p0.ll index ce1c54129f706..ae949dc1cb076 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v4p0.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v4p0.ll @@ -142,6 +142,7 @@ define void @v_shuffle_v4p0_v4p0__3_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v6 ; GFX900-NEXT: v_mov_b32_e32 v1, v7 @@ -155,6 +156,7 @@ define void @v_shuffle_v4p0_v4p0__3_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 @@ -168,6 +170,7 @@ define void @v_shuffle_v4p0_v4p0__3_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v7 @@ -278,6 +281,7 @@ define void @v_shuffle_v4p0_v4p0__7_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v6 ; GFX900-NEXT: v_mov_b32_e32 v1, v7 @@ -291,6 +295,7 @@ define void @v_shuffle_v4p0_v4p0__7_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 @@ -304,6 +309,7 @@ define void @v_shuffle_v4p0_v4p0__7_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v7 @@ -918,16 +924,17 @@ define void @v_shuffle_v4p0_v4p0__7_7_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr8_vgpr9_vgpr10_vgpr11 +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: v_mov_b32_e32 v8, v6 ; GFX900-NEXT: v_mov_b32_e32 v9, v7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: v_mov_b32_e32 v4, v6 ; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v10, v[8:11], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -940,6 +947,7 @@ define void @v_shuffle_v4p0_v4p0__7_7_3_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 @@ -959,6 +967,7 @@ define void @v_shuffle_v4p0_v4p0__7_7_3_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[8:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v7 @@ -1135,6 +1144,7 @@ define void @v_shuffle_v4p0_v4p0__7_7_7_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v6 ; GFX900-NEXT: v_mov_b32_e32 v1, v7 @@ -1151,6 +1161,7 @@ define void @v_shuffle_v4p0_v4p0__7_7_7_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 @@ -1167,6 +1178,7 @@ define void @v_shuffle_v4p0_v4p0__7_7_7_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v7 @@ -1685,7 +1697,10 @@ define void @v_shuffle_v4p0_v4p0__u_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 ; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1699,7 +1714,10 @@ define void @v_shuffle_v4p0_v4p0__u_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1713,7 +1731,10 @@ define void @v_shuffle_v4p0_v4p0__u_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -1949,7 +1970,10 @@ define void @v_shuffle_v4p0_v4p0__4_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 ; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1963,7 +1987,10 @@ define void @v_shuffle_v4p0_v4p0__4_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1977,7 +2004,10 @@ define void @v_shuffle_v4p0_v4p0__4_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr> asm "; def $0", "=v"() @@ -2201,6 +2231,7 @@ define void @v_shuffle_v4p0_v4p0__7_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 ; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v8 ; GFX900-NEXT: v_mov_b32_e32 v1, v9 @@ -2221,6 +2252,7 @@ define void @v_shuffle_v4p0_v4p0__7_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v8 ; GFX90A-NEXT: v_mov_b32_e32 v1, v9 @@ -2242,6 +2274,7 @@ define void @v_shuffle_v4p0_v4p0__7_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mov_b32_e32 v0, v8 ; GFX942-NEXT: v_mov_b32_e32 v1, v9 @@ -2736,16 +2769,17 @@ define void @v_shuffle_v4p0_v4p0__7_7_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, v0 -; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: ; implicit-def: $vgpr8_vgpr9_vgpr10_vgpr11 +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v0 +; GFX900-NEXT: v_mov_b32_e32 v11, v1 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: v_mov_b32_e32 v4, v6 ; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2758,12 +2792,13 @@ define void @v_shuffle_v4p0_v4p0__7_7_u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 ; GFX90A-NEXT: v_mov_b32_e32 v6, v8 ; GFX90A-NEXT: v_mov_b32_e32 v7, v9 -; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -2778,12 +2813,13 @@ define void @v_shuffle_v4p0_v4p0__7_7_u_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:9] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 ; GFX942-NEXT: v_mov_b32_e32 v6, v8 ; GFX942-NEXT: v_mov_b32_e32 v7, v9 -; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 ; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -3706,9 +3742,10 @@ define void @v_shuffle_v4p0_v4p0__7_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: v_mov_b32_e32 v4, v2 ; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v0, v10 ; GFX900-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 ; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -3725,9 +3762,10 @@ define void @v_shuffle_v4p0_v4p0__7_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v0, v10 ; GFX90A-NEXT: v_mov_b32_e32 v1, v11 -; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -3745,9 +3783,10 @@ define void @v_shuffle_v4p0_v4p0__7_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mov_b32_e32 v4, v2 ; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v0, v10 ; GFX942-NEXT: v_mov_b32_e32 v1, v11 -; GFX942-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 ; GFX942-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -5184,6 +5223,7 @@ define void @v_shuffle_v4p0_v4p0__7_u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[6:13] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v14, 0 ; GFX900-NEXT: v_mov_b32_e32 v6, v4 ; GFX900-NEXT: v_mov_b32_e32 v7, v5 @@ -5203,6 +5243,7 @@ define void @v_shuffle_v4p0_v4p0__7_u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v14, 0 ; GFX90A-NEXT: v_mov_b32_e32 v6, v4 ; GFX90A-NEXT: v_mov_b32_e32 v7, v5 @@ -5219,6 +5260,7 @@ define void @v_shuffle_v4p0_v4p0__7_u_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[6:13] @@ -6654,6 +6696,7 @@ define void @v_shuffle_v4p0_v4p0__7_u_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v16, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[8:15] @@ -6673,6 +6716,7 @@ define void @v_shuffle_v4p0_v4p0__7_u_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v16, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[8:15] @@ -6692,6 +6736,7 @@ define void @v_shuffle_v4p0_v4p0__7_u_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[8:15] @@ -7758,6 +7803,7 @@ define void @v_shuffle_v4p0_v4p0__3_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v6 ; GFX900-NEXT: v_mov_b32_e32 v1, v7 @@ -7771,6 +7817,7 @@ define void @v_shuffle_v4p0_v4p0__3_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 @@ -7784,6 +7831,7 @@ define void @v_shuffle_v4p0_v4p0__3_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v7 @@ -7989,6 +8037,7 @@ define void @v_shuffle_v4p0_v4p0__7_u_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 ; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: v_mov_b32_e32 v0, v6 ; GFX900-NEXT: v_mov_b32_e32 v1, v7 @@ -8006,6 +8055,7 @@ define void @v_shuffle_v4p0_v4p0__7_u_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 @@ -8023,6 +8073,7 @@ define void @v_shuffle_v4p0_v4p0__7_u_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v7 @@ -8484,12 +8535,14 @@ define void @v_shuffle_v4p0_v4p0__7_7_u_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX900-NEXT: v_mov_b32_e32 v8, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: v_mov_b32_e32 v4, v6 ; GFX900-NEXT: v_mov_b32_e32 v5, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 ; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -8500,12 +8553,14 @@ define void @v_shuffle_v4p0_v4p0__7_7_u_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: v_mov_b32_e32 v4, v6 ; GFX90A-NEXT: v_mov_b32_e32 v5, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -8516,12 +8571,14 @@ define void @v_shuffle_v4p0_v4p0__7_7_u_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mov_b32_e32 v4, v6 ; GFX942-NEXT: v_mov_b32_e32 v5, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 ; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -9424,9 +9481,10 @@ define void @v_shuffle_v4p0_v4p0__7_u_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: v_mov_b32_e32 v4, v2 ; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v0, v6 ; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 ; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -9440,9 +9498,10 @@ define void @v_shuffle_v4p0_v4p0__7_u_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -9456,9 +9515,10 @@ define void @v_shuffle_v4p0_v4p0__7_u_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v4, v2 ; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 ; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -10855,9 +10915,10 @@ define void @v_shuffle_v4p0_v4p0__7_u_6_6(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v4 ; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v0, v6 ; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 ; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -10871,9 +10932,10 @@ define void @v_shuffle_v4p0_v4p0__7_u_6_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -10887,9 +10949,10 @@ define void @v_shuffle_v4p0_v4p0__7_u_6_6(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 ; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -12247,6 +12310,7 @@ define void @v_shuffle_v4p0_v4p0__7_u_7_7(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:7] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: v_mov_b32_e32 v4, v6 ; GFX900-NEXT: v_mov_b32_e32 v5, v7 @@ -12263,6 +12327,7 @@ define void @v_shuffle_v4p0_v4p0__7_u_7_7(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: v_mov_b32_e32 v4, v6 ; GFX90A-NEXT: v_mov_b32_e32 v5, v7 @@ -12279,6 +12344,7 @@ define void @v_shuffle_v4p0_v4p0__7_u_7_7(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:7] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: v_mov_b32_e32 v4, v6 ; GFX942-NEXT: v_mov_b32_e32 v5, v7 @@ -13263,6 +13329,7 @@ define void @s_shuffle_v4p0_v4p0__1_u_u_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: s_mov_b32 s8, s6 ; GFX900-NEXT: s_mov_b32 s9, s7 ; GFX900-NEXT: ;;#ASMSTART @@ -13276,6 +13343,7 @@ define void @s_shuffle_v4p0_v4p0__1_u_u_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: s_mov_b32 s8, s6 ; GFX90A-NEXT: s_mov_b32 s9, s7 ; GFX90A-NEXT: ;;#ASMSTART @@ -13285,6 +13353,7 @@ define void @s_shuffle_v4p0_v4p0__1_u_u_u() { ; ; GFX942-LABEL: s_shuffle_v4p0_v4p0__1_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -13306,8 +13375,11 @@ define void @s_shuffle_v4p0_v4p0__2_u_u_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX900-NEXT: s_mov_b32 s8, s16 +; GFX900-NEXT: s_mov_b32 s9, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -13317,8 +13389,11 @@ define void @s_shuffle_v4p0_v4p0__2_u_u_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX90A-NEXT: s_mov_b32 s8, s16 +; GFX90A-NEXT: s_mov_b32 s9, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -13326,6 +13401,7 @@ define void @s_shuffle_v4p0_v4p0__2_u_u_u() { ; ; GFX942-LABEL: s_shuffle_v4p0_v4p0__2_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -13347,10 +13423,11 @@ define void @s_shuffle_v4p0_v4p0__3_u_u_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -13360,10 +13437,11 @@ define void @s_shuffle_v4p0_v4p0__3_u_u_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -13371,6 +13449,7 @@ define void @s_shuffle_v4p0_v4p0__3_u_u_u() { ; ; GFX942-LABEL: s_shuffle_v4p0_v4p0__3_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -13408,6 +13487,7 @@ define void @s_shuffle_v4p0_v4p0__5_u_u_u() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: s_mov_b32 s8, s6 ; GFX900-NEXT: s_mov_b32 s9, s7 ; GFX900-NEXT: ;;#ASMSTART @@ -13421,6 +13501,7 @@ define void @s_shuffle_v4p0_v4p0__5_u_u_u() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: s_mov_b32 s8, s6 ; GFX90A-NEXT: s_mov_b32 s9, s7 ; GFX90A-NEXT: ;;#ASMSTART @@ -13430,6 +13511,7 @@ define void @s_shuffle_v4p0_v4p0__5_u_u_u() { ; ; GFX942-LABEL: s_shuffle_v4p0_v4p0__5_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -13452,8 +13534,11 @@ define void @s_shuffle_v4p0_v4p0__6_u_u_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX900-NEXT: s_mov_b32 s8, s16 +; GFX900-NEXT: s_mov_b32 s9, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -13463,8 +13548,11 @@ define void @s_shuffle_v4p0_v4p0__6_u_u_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX90A-NEXT: s_mov_b32 s8, s16 +; GFX90A-NEXT: s_mov_b32 s9, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -13472,6 +13560,7 @@ define void @s_shuffle_v4p0_v4p0__6_u_u_u() { ; ; GFX942-LABEL: s_shuffle_v4p0_v4p0__6_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -13494,10 +13583,11 @@ define void @s_shuffle_v4p0_v4p0__7_u_u_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -13507,10 +13597,11 @@ define void @s_shuffle_v4p0_v4p0__7_u_u_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -13518,6 +13609,7 @@ define void @s_shuffle_v4p0_v4p0__7_u_u_u() { ; ; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -13543,10 +13635,11 @@ define void @s_shuffle_v4p0_v4p0__7_0_u_u() { ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 ; GFX900-NEXT: s_mov_b32 s10, s4 ; GFX900-NEXT: s_mov_b32 s11, s5 ; GFX900-NEXT: ;;#ASMSTART @@ -13561,10 +13654,11 @@ define void @s_shuffle_v4p0_v4p0__7_0_u_u() { ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 ; GFX90A-NEXT: s_mov_b32 s10, s4 ; GFX90A-NEXT: s_mov_b32 s11, s5 ; GFX90A-NEXT: ;;#ASMSTART @@ -13576,14 +13670,14 @@ define void @s_shuffle_v4p0_v4p0__7_0_u_u() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[12:19] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] +; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: s_mov_b32 s8, s18 +; GFX942-NEXT: s_mov_b32 s9, s19 ; GFX942-NEXT: s_mov_b32 s10, s0 ; GFX942-NEXT: s_mov_b32 s11, s1 ; GFX942-NEXT: ;;#ASMSTART @@ -13657,15 +13751,16 @@ define void @s_shuffle_v4p0_v4p0__7_2_u_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[20:27] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s8, s26 +; GFX900-NEXT: s_mov_b32 s9, s27 +; GFX900-NEXT: s_mov_b32 s10, s16 +; GFX900-NEXT: s_mov_b32 s11, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -13675,15 +13770,16 @@ define void @s_shuffle_v4p0_v4p0__7_2_u_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[20:27] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s8, s26 +; GFX90A-NEXT: s_mov_b32 s9, s27 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s11, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -13693,13 +13789,14 @@ define void @s_shuffle_v4p0_v4p0__7_2_u_u() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[12:19] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 +; GFX942-NEXT: s_mov_b32 s8, s18 +; GFX942-NEXT: s_mov_b32 s9, s19 ; GFX942-NEXT: s_mov_b32 s10, s4 ; GFX942-NEXT: s_mov_b32 s11, s5 ; GFX942-NEXT: ;;#ASMSTART @@ -13718,13 +13815,16 @@ define void @s_shuffle_v4p0_v4p0__7_3_u_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ; def s[20:27] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s18 -; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: s_mov_b32 s8, s26 +; GFX900-NEXT: s_mov_b32 s9, s27 +; GFX900-NEXT: s_mov_b32 s10, s18 +; GFX900-NEXT: s_mov_b32 s11, s19 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -13734,13 +13834,16 @@ define void @s_shuffle_v4p0_v4p0__7_3_u_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ; def s[20:27] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s18 -; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: s_mov_b32 s8, s26 +; GFX90A-NEXT: s_mov_b32 s9, s27 +; GFX90A-NEXT: s_mov_b32 s10, s18 +; GFX90A-NEXT: s_mov_b32 s11, s19 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -13750,13 +13853,14 @@ define void @s_shuffle_v4p0_v4p0__7_3_u_u() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[12:19] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 +; GFX942-NEXT: s_mov_b32 s8, s18 +; GFX942-NEXT: s_mov_b32 s9, s19 ; GFX942-NEXT: s_mov_b32 s10, s6 ; GFX942-NEXT: s_mov_b32 s11, s7 ; GFX942-NEXT: ;;#ASMSTART @@ -13773,14 +13877,15 @@ define void @s_shuffle_v4p0_v4p0__7_3_u_u() { define void @s_shuffle_v4p0_v4p0__7_4_u_u() { ; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_4_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[16:23] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s10, s4 -; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s8, s22 +; GFX900-NEXT: s_mov_b32 s9, s23 +; GFX900-NEXT: s_mov_b32 s10, s16 +; GFX900-NEXT: s_mov_b32 s11, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -13788,14 +13893,15 @@ define void @s_shuffle_v4p0_v4p0__7_4_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_4_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[16:23] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s10, s4 -; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s8, s22 +; GFX90A-NEXT: s_mov_b32 s9, s23 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s11, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -13803,6 +13909,7 @@ define void @s_shuffle_v4p0_v4p0__7_4_u_u() { ; ; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_4_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -13847,12 +13954,13 @@ define void @s_shuffle_v4p0_v4p0__7_6_u_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s12 -; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: s_mov_b32 s10, s16 +; GFX900-NEXT: s_mov_b32 s11, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -13862,12 +13970,13 @@ define void @s_shuffle_v4p0_v4p0__7_6_u_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s12 -; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s11, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -13875,6 +13984,7 @@ define void @s_shuffle_v4p0_v4p0__7_6_u_u() { ; ; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_6_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -13899,10 +14009,13 @@ define void @s_shuffle_v4p0_v4p0__7_7_u_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: s_mov_b32 s10, s18 +; GFX900-NEXT: s_mov_b32 s11, s19 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -13912,10 +14025,13 @@ define void @s_shuffle_v4p0_v4p0__7_7_u_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: s_mov_b32 s10, s18 +; GFX90A-NEXT: s_mov_b32 s11, s19 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -13923,6 +14039,7 @@ define void @s_shuffle_v4p0_v4p0__7_7_u_u() { ; ; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -13950,12 +14067,13 @@ define void @s_shuffle_v4p0_v4p0__7_7_0_u() { ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: s_mov_b32 s10, s18 +; GFX900-NEXT: s_mov_b32 s11, s19 ; GFX900-NEXT: s_mov_b32 s12, s4 ; GFX900-NEXT: s_mov_b32 s13, s5 ; GFX900-NEXT: ;;#ASMSTART @@ -13970,12 +14088,13 @@ define void @s_shuffle_v4p0_v4p0__7_7_0_u() { ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: s_mov_b32 s10, s18 +; GFX90A-NEXT: s_mov_b32 s11, s19 ; GFX90A-NEXT: s_mov_b32 s12, s4 ; GFX90A-NEXT: s_mov_b32 s13, s5 ; GFX90A-NEXT: ;;#ASMSTART @@ -13987,14 +14106,17 @@ define void @s_shuffle_v4p0_v4p0__7_7_0_u() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[12:19] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s0 +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] +; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: s_mov_b32 s8, s18 +; GFX942-NEXT: s_mov_b32 s9, s19 +; GFX942-NEXT: s_mov_b32 s10, s18 +; GFX942-NEXT: s_mov_b32 s11, s19 +; GFX942-NEXT: s_mov_b32 s12, s0 ; GFX942-NEXT: s_mov_b32 s13, s1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] @@ -14015,12 +14137,13 @@ define void @s_shuffle_v4p0_v4p0__7_7_1_u() { ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: s_mov_b32 s10, s18 +; GFX900-NEXT: s_mov_b32 s11, s19 ; GFX900-NEXT: s_mov_b32 s12, s6 ; GFX900-NEXT: s_mov_b32 s13, s7 ; GFX900-NEXT: ;;#ASMSTART @@ -14035,12 +14158,13 @@ define void @s_shuffle_v4p0_v4p0__7_7_1_u() { ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: s_mov_b32 s10, s18 +; GFX90A-NEXT: s_mov_b32 s11, s19 ; GFX90A-NEXT: s_mov_b32 s12, s6 ; GFX90A-NEXT: s_mov_b32 s13, s7 ; GFX90A-NEXT: ;;#ASMSTART @@ -14052,14 +14176,17 @@ define void @s_shuffle_v4p0_v4p0__7_7_1_u() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[12:19] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s2 +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] +; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: s_mov_b32 s8, s18 +; GFX942-NEXT: s_mov_b32 s9, s19 +; GFX942-NEXT: s_mov_b32 s10, s18 +; GFX942-NEXT: s_mov_b32 s11, s19 +; GFX942-NEXT: s_mov_b32 s12, s2 ; GFX942-NEXT: s_mov_b32 s13, s3 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] @@ -14134,15 +14261,18 @@ define void @s_shuffle_v4p0_v4p0__7_7_3_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[20:27] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s8, s26 +; GFX900-NEXT: s_mov_b32 s9, s27 +; GFX900-NEXT: s_mov_b32 s10, s26 +; GFX900-NEXT: s_mov_b32 s11, s27 +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -14152,15 +14282,18 @@ define void @s_shuffle_v4p0_v4p0__7_7_3_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[20:27] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s8, s26 +; GFX90A-NEXT: s_mov_b32 s9, s27 +; GFX90A-NEXT: s_mov_b32 s10, s26 +; GFX90A-NEXT: s_mov_b32 s11, s27 +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -14170,15 +14303,16 @@ define void @s_shuffle_v4p0_v4p0__7_7_3_u() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[12:19] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s14 -; GFX942-NEXT: s_mov_b32 s11, s15 +; GFX942-NEXT: s_mov_b32 s8, s18 +; GFX942-NEXT: s_mov_b32 s9, s19 +; GFX942-NEXT: s_mov_b32 s10, s18 +; GFX942-NEXT: s_mov_b32 s11, s19 ; GFX942-NEXT: s_mov_b32 s12, s6 ; GFX942-NEXT: s_mov_b32 s13, s7 ; GFX942-NEXT: ;;#ASMSTART @@ -14195,14 +14329,17 @@ define void @s_shuffle_v4p0_v4p0__7_7_3_u() { define void @s_shuffle_v4p0_v4p0__7_7_4_u() { ; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_4_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[16:23] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s8, s22 +; GFX900-NEXT: s_mov_b32 s9, s23 +; GFX900-NEXT: s_mov_b32 s10, s22 +; GFX900-NEXT: s_mov_b32 s11, s23 +; GFX900-NEXT: s_mov_b32 s12, s16 +; GFX900-NEXT: s_mov_b32 s13, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -14210,14 +14347,17 @@ define void @s_shuffle_v4p0_v4p0__7_7_4_u() { ; ; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_4_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[16:23] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s8, s22 +; GFX90A-NEXT: s_mov_b32 s9, s23 +; GFX90A-NEXT: s_mov_b32 s10, s22 +; GFX90A-NEXT: s_mov_b32 s11, s23 +; GFX90A-NEXT: s_mov_b32 s12, s16 +; GFX90A-NEXT: s_mov_b32 s13, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -14225,6 +14365,7 @@ define void @s_shuffle_v4p0_v4p0__7_7_4_u() { ; ; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_4_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -14249,14 +14390,17 @@ define void @s_shuffle_v4p0_v4p0__7_7_4_u() { define void @s_shuffle_v4p0_v4p0__7_7_5_u() { ; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_5_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[16:23] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: s_mov_b32 s8, s22 +; GFX900-NEXT: s_mov_b32 s9, s23 +; GFX900-NEXT: s_mov_b32 s10, s22 +; GFX900-NEXT: s_mov_b32 s11, s23 +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -14264,14 +14408,17 @@ define void @s_shuffle_v4p0_v4p0__7_7_5_u() { ; ; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_5_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[16:23] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: s_mov_b32 s8, s22 +; GFX90A-NEXT: s_mov_b32 s9, s23 +; GFX90A-NEXT: s_mov_b32 s10, s22 +; GFX90A-NEXT: s_mov_b32 s11, s23 +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -14279,6 +14426,7 @@ define void @s_shuffle_v4p0_v4p0__7_7_5_u() { ; ; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_5_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -14327,14 +14475,15 @@ define void @s_shuffle_v4p0_v4p0__7_7_7_u() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: s_mov_b32 s10, s18 +; GFX900-NEXT: s_mov_b32 s11, s19 +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -14344,14 +14493,15 @@ define void @s_shuffle_v4p0_v4p0__7_7_7_u() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s12, s14 -; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: s_mov_b32 s10, s18 +; GFX90A-NEXT: s_mov_b32 s11, s19 +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -14359,6 +14509,7 @@ define void @s_shuffle_v4p0_v4p0__7_7_7_u() { ; ; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_7_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -14885,6 +15036,7 @@ define void @s_shuffle_v4p0_v4p0__u_0_0_0() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: s_mov_b32 s10, s4 ; GFX900-NEXT: s_mov_b32 s11, s5 ; GFX900-NEXT: s_mov_b32 s12, s4 @@ -14902,6 +15054,7 @@ define void @s_shuffle_v4p0_v4p0__u_0_0_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: s_mov_b32 s10, s4 ; GFX90A-NEXT: s_mov_b32 s11, s5 ; GFX90A-NEXT: s_mov_b32 s12, s4 @@ -14915,6 +15068,7 @@ define void @s_shuffle_v4p0_v4p0__u_0_0_0() { ; ; GFX942-LABEL: s_shuffle_v4p0_v4p0__u_0_0_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -15150,6 +15304,7 @@ define void @s_shuffle_v4p0_v4p0__4_0_0_0() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: s_mov_b32 s10, s4 ; GFX900-NEXT: s_mov_b32 s11, s5 ; GFX900-NEXT: s_mov_b32 s12, s4 @@ -15167,6 +15322,7 @@ define void @s_shuffle_v4p0_v4p0__4_0_0_0() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: s_mov_b32 s10, s4 ; GFX90A-NEXT: s_mov_b32 s11, s5 ; GFX90A-NEXT: s_mov_b32 s12, s4 @@ -15180,6 +15336,7 @@ define void @s_shuffle_v4p0_v4p0__4_0_0_0() { ; ; GFX942-LABEL: s_shuffle_v4p0_v4p0__4_0_0_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -15425,10 +15582,11 @@ define void @s_shuffle_v4p0_v4p0__7_u_0_0() { ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 ; GFX900-NEXT: s_mov_b32 s12, s4 ; GFX900-NEXT: s_mov_b32 s13, s5 ; GFX900-NEXT: s_mov_b32 s14, s4 @@ -15445,10 +15603,11 @@ define void @s_shuffle_v4p0_v4p0__7_u_0_0() { ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 ; GFX90A-NEXT: s_mov_b32 s12, s4 ; GFX90A-NEXT: s_mov_b32 s13, s5 ; GFX90A-NEXT: s_mov_b32 s14, s4 @@ -15462,14 +15621,15 @@ define void @s_shuffle_v4p0_v4p0__7_u_0_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[12:19] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s0 +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] +; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: s_mov_b32 s8, s18 +; GFX942-NEXT: s_mov_b32 s9, s19 +; GFX942-NEXT: s_mov_b32 s12, s0 ; GFX942-NEXT: s_mov_b32 s13, s1 ; GFX942-NEXT: s_mov_b32 s14, s0 ; GFX942-NEXT: s_mov_b32 s15, s1 @@ -15991,12 +16151,13 @@ define void @s_shuffle_v4p0_v4p0__7_7_u_0() { ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: s_mov_b32 s10, s18 +; GFX900-NEXT: s_mov_b32 s11, s19 ; GFX900-NEXT: s_mov_b32 s14, s4 ; GFX900-NEXT: s_mov_b32 s15, s5 ; GFX900-NEXT: ;;#ASMSTART @@ -16011,12 +16172,13 @@ define void @s_shuffle_v4p0_v4p0__7_7_u_0() { ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: s_mov_b32 s10, s18 +; GFX90A-NEXT: s_mov_b32 s11, s19 ; GFX90A-NEXT: s_mov_b32 s14, s4 ; GFX90A-NEXT: s_mov_b32 s15, s5 ; GFX90A-NEXT: ;;#ASMSTART @@ -16028,14 +16190,17 @@ define void @s_shuffle_v4p0_v4p0__7_7_u_0() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[12:19] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s14, s0 +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] +; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: s_mov_b32 s8, s18 +; GFX942-NEXT: s_mov_b32 s9, s19 +; GFX942-NEXT: s_mov_b32 s10, s18 +; GFX942-NEXT: s_mov_b32 s11, s19 +; GFX942-NEXT: s_mov_b32 s14, s0 ; GFX942-NEXT: s_mov_b32 s15, s1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] @@ -16803,10 +16968,11 @@ define void @s_shuffle_v4p0_v4p0__7_u_1_1() { ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 ; GFX900-NEXT: s_mov_b32 s12, s6 ; GFX900-NEXT: s_mov_b32 s13, s7 ; GFX900-NEXT: s_mov_b32 s14, s6 @@ -16823,10 +16989,11 @@ define void @s_shuffle_v4p0_v4p0__7_u_1_1() { ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 ; GFX90A-NEXT: s_mov_b32 s12, s6 ; GFX90A-NEXT: s_mov_b32 s13, s7 ; GFX90A-NEXT: s_mov_b32 s14, s6 @@ -16840,14 +17007,15 @@ define void @s_shuffle_v4p0_v4p0__7_u_1_1() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[12:19] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s12, s2 +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] +; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: s_mov_b32 s8, s18 +; GFX942-NEXT: s_mov_b32 s9, s19 +; GFX942-NEXT: s_mov_b32 s12, s2 ; GFX942-NEXT: s_mov_b32 s13, s3 ; GFX942-NEXT: s_mov_b32 s14, s2 ; GFX942-NEXT: s_mov_b32 s15, s3 @@ -17365,12 +17533,13 @@ define void @s_shuffle_v4p0_v4p0__7_7_u_1() { ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: s_mov_b32 s10, s18 +; GFX900-NEXT: s_mov_b32 s11, s19 ; GFX900-NEXT: s_mov_b32 s14, s6 ; GFX900-NEXT: s_mov_b32 s15, s7 ; GFX900-NEXT: ;;#ASMSTART @@ -17385,12 +17554,13 @@ define void @s_shuffle_v4p0_v4p0__7_7_u_1() { ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: s_mov_b32 s10, s18 +; GFX90A-NEXT: s_mov_b32 s11, s19 ; GFX90A-NEXT: s_mov_b32 s14, s6 ; GFX90A-NEXT: s_mov_b32 s15, s7 ; GFX90A-NEXT: ;;#ASMSTART @@ -17402,14 +17572,17 @@ define void @s_shuffle_v4p0_v4p0__7_7_u_1() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:7] +; GFX942-NEXT: ; def s[12:19] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s14, s2 +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[4:11] +; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s10 -; GFX942-NEXT: s_mov_b32 s9, s11 +; GFX942-NEXT: s_mov_b32 s8, s18 +; GFX942-NEXT: s_mov_b32 s9, s19 +; GFX942-NEXT: s_mov_b32 s10, s18 +; GFX942-NEXT: s_mov_b32 s11, s19 +; GFX942-NEXT: s_mov_b32 s14, s2 ; GFX942-NEXT: s_mov_b32 s15, s3 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:15] @@ -18688,15 +18861,18 @@ define void @s_shuffle_v4p0_v4p0__7_7_u_2() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[20:27] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s14, s12 -; GFX900-NEXT: s_mov_b32 s15, s13 +; GFX900-NEXT: s_mov_b32 s8, s26 +; GFX900-NEXT: s_mov_b32 s9, s27 +; GFX900-NEXT: s_mov_b32 s10, s26 +; GFX900-NEXT: s_mov_b32 s11, s27 +; GFX900-NEXT: s_mov_b32 s14, s16 +; GFX900-NEXT: s_mov_b32 s15, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -18706,15 +18882,18 @@ define void @s_shuffle_v4p0_v4p0__7_7_u_2() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[20:27] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s14, s12 -; GFX90A-NEXT: s_mov_b32 s15, s13 +; GFX90A-NEXT: s_mov_b32 s8, s26 +; GFX90A-NEXT: s_mov_b32 s9, s27 +; GFX90A-NEXT: s_mov_b32 s10, s26 +; GFX90A-NEXT: s_mov_b32 s11, s27 +; GFX90A-NEXT: s_mov_b32 s14, s16 +; GFX90A-NEXT: s_mov_b32 s15, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -18724,15 +18903,16 @@ define void @s_shuffle_v4p0_v4p0__7_7_u_2() { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[8:15] +; GFX942-NEXT: ; def s[12:19] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 -; GFX942-NEXT: s_mov_b32 s10, s14 -; GFX942-NEXT: s_mov_b32 s11, s15 +; GFX942-NEXT: s_mov_b32 s8, s18 +; GFX942-NEXT: s_mov_b32 s9, s19 +; GFX942-NEXT: s_mov_b32 s10, s18 +; GFX942-NEXT: s_mov_b32 s11, s19 ; GFX942-NEXT: s_mov_b32 s14, s4 ; GFX942-NEXT: s_mov_b32 s15, s5 ; GFX942-NEXT: ;;#ASMSTART @@ -20509,6 +20689,7 @@ define void @s_shuffle_v4p0_v4p0__1_4_4_4() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:11] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: s_mov_b32 s8, s6 ; GFX900-NEXT: s_mov_b32 s9, s7 ; GFX900-NEXT: ;;#ASMSTART @@ -20522,6 +20703,7 @@ define void @s_shuffle_v4p0_v4p0__1_4_4_4() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:11] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: s_mov_b32 s8, s6 ; GFX90A-NEXT: s_mov_b32 s9, s7 ; GFX90A-NEXT: ;;#ASMSTART @@ -20531,6 +20713,7 @@ define void @s_shuffle_v4p0_v4p0__1_4_4_4() { ; ; GFX942-LABEL: s_shuffle_v4p0_v4p0__1_4_4_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -20552,8 +20735,11 @@ define void @s_shuffle_v4p0_v4p0__2_4_4_4() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX900-NEXT: s_mov_b32 s8, s16 +; GFX900-NEXT: s_mov_b32 s9, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -20563,8 +20749,11 @@ define void @s_shuffle_v4p0_v4p0__2_4_4_4() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX90A-NEXT: s_mov_b32 s8, s16 +; GFX90A-NEXT: s_mov_b32 s9, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -20572,6 +20761,7 @@ define void @s_shuffle_v4p0_v4p0__2_4_4_4() { ; ; GFX942-LABEL: s_shuffle_v4p0_v4p0__2_4_4_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -20593,10 +20783,11 @@ define void @s_shuffle_v4p0_v4p0__3_4_4_4() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -20606,10 +20797,11 @@ define void @s_shuffle_v4p0_v4p0__3_4_4_4() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -20617,6 +20809,7 @@ define void @s_shuffle_v4p0_v4p0__3_4_4_4() { ; ; GFX942-LABEL: s_shuffle_v4p0_v4p0__3_4_4_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -20838,16 +21031,17 @@ define void @s_shuffle_v4p0_v4p0__7_4_4_4() { define void @s_shuffle_v4p0_v4p0__7_u_4_4() { ; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_u_4_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[16:23] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s12, s4 -; GFX900-NEXT: s_mov_b32 s13, s5 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: s_mov_b32 s8, s22 +; GFX900-NEXT: s_mov_b32 s9, s23 +; GFX900-NEXT: s_mov_b32 s12, s16 +; GFX900-NEXT: s_mov_b32 s13, s17 +; GFX900-NEXT: s_mov_b32 s14, s16 +; GFX900-NEXT: s_mov_b32 s15, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -20855,16 +21049,17 @@ define void @s_shuffle_v4p0_v4p0__7_u_4_4() { ; ; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_u_4_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[16:23] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s12, s4 -; GFX90A-NEXT: s_mov_b32 s13, s5 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: s_mov_b32 s8, s22 +; GFX90A-NEXT: s_mov_b32 s9, s23 +; GFX90A-NEXT: s_mov_b32 s12, s16 +; GFX90A-NEXT: s_mov_b32 s13, s17 +; GFX90A-NEXT: s_mov_b32 s14, s16 +; GFX90A-NEXT: s_mov_b32 s15, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -20872,6 +21067,7 @@ define void @s_shuffle_v4p0_v4p0__7_u_4_4() { ; ; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_u_4_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -21367,14 +21563,17 @@ define void @s_shuffle_v4p0_v4p0__7_7_4_4() { define void @s_shuffle_v4p0_v4p0__7_7_u_4() { ; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_u_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[16:23] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s14, s4 -; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: s_mov_b32 s8, s22 +; GFX900-NEXT: s_mov_b32 s9, s23 +; GFX900-NEXT: s_mov_b32 s10, s22 +; GFX900-NEXT: s_mov_b32 s11, s23 +; GFX900-NEXT: s_mov_b32 s14, s16 +; GFX900-NEXT: s_mov_b32 s15, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -21382,14 +21581,17 @@ define void @s_shuffle_v4p0_v4p0__7_7_u_4() { ; ; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_u_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[16:23] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s14, s4 -; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: s_mov_b32 s8, s22 +; GFX90A-NEXT: s_mov_b32 s9, s23 +; GFX90A-NEXT: s_mov_b32 s10, s22 +; GFX90A-NEXT: s_mov_b32 s11, s23 +; GFX90A-NEXT: s_mov_b32 s14, s16 +; GFX90A-NEXT: s_mov_b32 s15, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -21397,6 +21599,7 @@ define void @s_shuffle_v4p0_v4p0__7_7_u_4() { ; ; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_u_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -22199,16 +22402,17 @@ define void @s_shuffle_v4p0_v4p0__7_5_5_5() { define void @s_shuffle_v4p0_v4p0__7_u_5_5() { ; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_u_5_5: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[16:23] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s12, s6 -; GFX900-NEXT: s_mov_b32 s13, s7 -; GFX900-NEXT: s_mov_b32 s14, s6 -; GFX900-NEXT: s_mov_b32 s15, s7 +; GFX900-NEXT: s_mov_b32 s8, s22 +; GFX900-NEXT: s_mov_b32 s9, s23 +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s14, s18 +; GFX900-NEXT: s_mov_b32 s15, s19 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -22216,16 +22420,17 @@ define void @s_shuffle_v4p0_v4p0__7_u_5_5() { ; ; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_u_5_5: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[16:23] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s12, s6 -; GFX90A-NEXT: s_mov_b32 s13, s7 -; GFX90A-NEXT: s_mov_b32 s14, s6 -; GFX90A-NEXT: s_mov_b32 s15, s7 +; GFX90A-NEXT: s_mov_b32 s8, s22 +; GFX90A-NEXT: s_mov_b32 s9, s23 +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s14, s18 +; GFX90A-NEXT: s_mov_b32 s15, s19 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -22233,6 +22438,7 @@ define void @s_shuffle_v4p0_v4p0__7_u_5_5() { ; ; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_u_5_5: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -22708,14 +22914,17 @@ define void @s_shuffle_v4p0_v4p0__7_7_5_5() { define void @s_shuffle_v4p0_v4p0__7_7_u_5() { ; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_u_5: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ; def s[16:23] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s11 -; GFX900-NEXT: s_mov_b32 s14, s6 -; GFX900-NEXT: s_mov_b32 s15, s7 +; GFX900-NEXT: s_mov_b32 s8, s22 +; GFX900-NEXT: s_mov_b32 s9, s23 +; GFX900-NEXT: s_mov_b32 s10, s22 +; GFX900-NEXT: s_mov_b32 s11, s23 +; GFX900-NEXT: s_mov_b32 s14, s18 +; GFX900-NEXT: s_mov_b32 s15, s19 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -22723,14 +22932,17 @@ define void @s_shuffle_v4p0_v4p0__7_7_u_5() { ; ; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_u_5: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ; def s[16:23] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s11 -; GFX90A-NEXT: s_mov_b32 s14, s6 -; GFX90A-NEXT: s_mov_b32 s15, s7 +; GFX90A-NEXT: s_mov_b32 s8, s22 +; GFX90A-NEXT: s_mov_b32 s9, s23 +; GFX90A-NEXT: s_mov_b32 s10, s22 +; GFX90A-NEXT: s_mov_b32 s11, s23 +; GFX90A-NEXT: s_mov_b32 s14, s18 +; GFX90A-NEXT: s_mov_b32 s15, s19 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -22738,6 +22950,7 @@ define void @s_shuffle_v4p0_v4p0__7_7_u_5() { ; ; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_u_5: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] @@ -23932,14 +24145,15 @@ define void @s_shuffle_v4p0_v4p0__7_7_u_6() { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ; def s[12:19] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s14 -; GFX900-NEXT: s_mov_b32 s9, s15 -; GFX900-NEXT: s_mov_b32 s10, s14 -; GFX900-NEXT: s_mov_b32 s11, s15 -; GFX900-NEXT: s_mov_b32 s14, s12 -; GFX900-NEXT: s_mov_b32 s15, s13 +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: s_mov_b32 s10, s18 +; GFX900-NEXT: s_mov_b32 s11, s19 +; GFX900-NEXT: s_mov_b32 s14, s16 +; GFX900-NEXT: s_mov_b32 s15, s17 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:15] ; GFX900-NEXT: ;;#ASMEND @@ -23949,14 +24163,15 @@ define void @s_shuffle_v4p0_v4p0__7_7_u_6() { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ; def s[12:19] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s14 -; GFX90A-NEXT: s_mov_b32 s9, s15 -; GFX90A-NEXT: s_mov_b32 s10, s14 -; GFX90A-NEXT: s_mov_b32 s11, s15 -; GFX90A-NEXT: s_mov_b32 s14, s12 -; GFX90A-NEXT: s_mov_b32 s15, s13 +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: s_mov_b32 s10, s18 +; GFX90A-NEXT: s_mov_b32 s11, s19 +; GFX90A-NEXT: s_mov_b32 s14, s16 +; GFX90A-NEXT: s_mov_b32 s15, s17 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:15] ; GFX90A-NEXT: ;;#ASMEND @@ -23964,6 +24179,7 @@ define void @s_shuffle_v4p0_v4p0__7_7_u_6() { ; ; GFX942-LABEL: s_shuffle_v4p0_v4p0__7_7_u_6: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:7] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v2p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v2p3.ll index 3b5690562c38a..0905f20a8d078 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v2p3.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v2p3.ll @@ -59,35 +59,39 @@ define void @v_shuffle_v4p3_v2p3__1_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v2p3__1_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v2p3__1_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[4:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx4 v2, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -112,35 +116,39 @@ define void @v_shuffle_v4p3_v2p3__3_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v2p3__3_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v2p3__3_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[4:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx4 v2, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -271,28 +279,30 @@ define void @v_shuffle_v4p3_v2p3__3_2_u_u(ptr addrspace(1) inreg %ptr) { ; ; GFX90A-LABEL: v_shuffle_v4p3_v2p3__3_2_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v2p3__3_2_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[4:5] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2068,35 +2078,39 @@ define void @v_shuffle_v4p3_v2p3__1_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ; def v[3:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v2p3__1_2_2_2: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v2p3__1_2_2_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:1] +; GFX942-NEXT: ; def v[4:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx4 v2, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2371,43 +2385,47 @@ define void @v_shuffle_v4p3_v2p3__3_3_2_2(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4p3_v2p3__3_3_u_2(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v2p3__3_3_u_2: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ; def v[4:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v2p3__3_3_u_2: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v2p3__3_3_u_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:3] +; GFX942-NEXT: ; def v[4:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3201,6 +3219,7 @@ define void @s_shuffle_v4p3_v2p3__0_u_u_u() { define void @s_shuffle_v4p3_v2p3__1_u_u_u() { ; GFX900-LABEL: s_shuffle_v4p3_v2p3__1_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -3213,6 +3232,7 @@ define void @s_shuffle_v4p3_v2p3__1_u_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4p3_v2p3__1_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -3225,6 +3245,7 @@ define void @s_shuffle_v4p3_v2p3__1_u_u_u() { ; ; GFX942-LABEL: s_shuffle_v4p3_v2p3__1_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -3257,6 +3278,7 @@ define void @s_shuffle_v4p3_v2p3__2_u_u_u() { define void @s_shuffle_v4p3_v2p3__3_u_u_u() { ; GFX900-LABEL: s_shuffle_v4p3_v2p3__3_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -3269,6 +3291,7 @@ define void @s_shuffle_v4p3_v2p3__3_u_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4p3_v2p3__3_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -3281,6 +3304,7 @@ define void @s_shuffle_v4p3_v2p3__3_u_u_u() { ; ; GFX942-LABEL: s_shuffle_v4p3_v2p3__3_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -3300,6 +3324,7 @@ define void @s_shuffle_v4p3_v2p3__3_u_u_u() { define void @s_shuffle_v4p3_v2p3__3_0_u_u() { ; GFX900-LABEL: s_shuffle_v4p3_v2p3__3_0_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -3316,6 +3341,7 @@ define void @s_shuffle_v4p3_v2p3__3_0_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4p3_v2p3__3_0_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -3332,6 +3358,7 @@ define void @s_shuffle_v4p3_v2p3__3_0_u_u() { ; ; GFX942-LABEL: s_shuffle_v4p3_v2p3__3_0_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -3407,6 +3434,7 @@ define void @s_shuffle_v4p3_v2p3__3_1_u_u() { define void @s_shuffle_v4p3_v2p3__3_2_u_u() { ; GFX900-LABEL: s_shuffle_v4p3_v2p3__3_2_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -3420,6 +3448,7 @@ define void @s_shuffle_v4p3_v2p3__3_2_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4p3_v2p3__3_2_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -3433,6 +3462,7 @@ define void @s_shuffle_v4p3_v2p3__3_2_u_u() { ; ; GFX942-LABEL: s_shuffle_v4p3_v2p3__3_2_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -4731,6 +4761,7 @@ define void @s_shuffle_v4p3_v2p3__0_2_2_2() { define void @s_shuffle_v4p3_v2p3__1_2_2_2() { ; GFX900-LABEL: s_shuffle_v4p3_v2p3__1_2_2_2: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -4743,6 +4774,7 @@ define void @s_shuffle_v4p3_v2p3__1_2_2_2() { ; ; GFX90A-LABEL: s_shuffle_v4p3_v2p3__1_2_2_2: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -4755,6 +4787,7 @@ define void @s_shuffle_v4p3_v2p3__1_2_2_2() { ; ; GFX942-LABEL: s_shuffle_v4p3_v2p3__1_2_2_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] @@ -4930,6 +4963,7 @@ define void @s_shuffle_v4p3_v2p3__3_3_2_2() { define void @s_shuffle_v4p3_v2p3__3_3_u_2() { ; GFX900-LABEL: s_shuffle_v4p3_v2p3__3_3_u_2: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:5] @@ -4944,6 +4978,7 @@ define void @s_shuffle_v4p3_v2p3__3_3_u_2() { ; ; GFX90A-LABEL: s_shuffle_v4p3_v2p3__3_3_u_2: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:5] @@ -4958,6 +4993,7 @@ define void @s_shuffle_v4p3_v2p3__3_3_u_2() { ; ; GFX942-LABEL: s_shuffle_v4p3_v2p3__3_3_u_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll index 8039e126590b9..6453be2e7a548 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll @@ -59,35 +59,39 @@ define void @v_shuffle_v4p3_v3p3__1_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__1_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__1_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -101,11 +105,12 @@ define void @v_shuffle_v4p3_v3p3__2_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -113,11 +118,12 @@ define void @v_shuffle_v4p3_v3p3__2_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -125,11 +131,12 @@ define void @v_shuffle_v4p3_v3p3__2_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -154,35 +161,39 @@ define void @v_shuffle_v4p3_v3p3__4_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__4_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__4_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -197,11 +208,12 @@ define void @v_shuffle_v4p3_v3p3__5_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -209,11 +221,12 @@ define void @v_shuffle_v4p3_v3p3__5_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -221,11 +234,12 @@ define void @v_shuffle_v4p3_v3p3__5_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -255,15 +269,16 @@ define void @v_shuffle_v4p3_v3p3__5_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX90A-NEXT: v_mov_b32_e32 v9, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[6:8] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx4 v9, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -271,15 +286,16 @@ define void @v_shuffle_v4p3_v3p3__5_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX942-NEXT: v_mov_b32_e32 v9, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[6:8] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: v_mov_b32_e32 v0, v8 +; GFX942-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -347,15 +363,16 @@ define void @v_shuffle_v4p3_v3p3__5_2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[5:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -363,15 +380,16 @@ define void @v_shuffle_v4p3_v3p3__5_2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[6:8] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -379,15 +397,16 @@ define void @v_shuffle_v4p3_v3p3__5_2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[6:8] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: v_mov_b32_e32 v0, v8 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -412,28 +431,30 @@ define void @v_shuffle_v4p3_v3p3__5_3_u_u(ptr addrspace(1) inreg %ptr) { ; ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_3_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_3_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -491,12 +512,13 @@ define void @v_shuffle_v4p3_v3p3__5_5_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -504,12 +526,13 @@ define void @v_shuffle_v4p3_v3p3__5_5_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -517,12 +540,13 @@ define void @v_shuffle_v4p3_v3p3__5_5_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -537,15 +561,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[5:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -553,15 +579,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX90A-NEXT: v_mov_b32_e32 v9, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[6:8] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v8 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: global_store_dwordx4 v9, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -569,16 +597,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_0_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX942-NEXT: v_mov_b32_e32 v9, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[6:8] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v0, v8 +; GFX942-NEXT: v_mov_b32_e32 v1, v8 +; GFX942-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -609,16 +638,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_u(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -626,17 +656,18 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_u(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: v_mov_b32_e32 v2, v5 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -705,41 +736,47 @@ define void @v_shuffle_v4p3_v3p3__5_5_2_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4p3_v3p3__5_5_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_5_3_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_3_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_3_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -765,29 +802,32 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_u(ptr addrspace(1) inreg %ptr) { ; ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_4_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_4_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: v_mov_b32_e32 v2, v5 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1179,29 +1219,32 @@ define void @v_shuffle_v4p3_v3p3__u_0_0_0(ptr addrspace(1) inreg %ptr) { ; ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__u_0_0_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__u_0_0_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1374,29 +1417,32 @@ define void @v_shuffle_v4p3_v3p3__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__3_0_0_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__3_0_0_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1529,16 +1575,17 @@ define void @v_shuffle_v4p3_v3p3__5_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: ; def v[5:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1546,16 +1593,17 @@ define void @v_shuffle_v4p3_v3p3__5_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX90A-NEXT: v_mov_b32_e32 v9, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[6:8] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v9, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1563,16 +1611,17 @@ define void @v_shuffle_v4p3_v3p3__5_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX942-NEXT: v_mov_b32_e32 v9, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[6:8] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_mov_b32_e32 v0, v8 ; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1884,16 +1933,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[5:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1901,16 +1951,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX90A-NEXT: v_mov_b32_e32 v9, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[6:8] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v9, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1918,16 +1969,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_u_0(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX942-NEXT: v_mov_b32_e32 v9, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[6:8] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v0, v8 +; GFX942-NEXT: v_mov_b32_e32 v1, v8 +; GFX942-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2545,16 +2597,17 @@ define void @v_shuffle_v4p3_v3p3__5_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2562,17 +2615,18 @@ define void @v_shuffle_v4p3_v3p3__5_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, v5 +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2876,16 +2930,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ; def v[5:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v1, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2893,15 +2948,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2909,16 +2966,18 @@ define void @v_shuffle_v4p3_v3p3__5_5_u_1(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3268,29 +3327,31 @@ define void @v_shuffle_v4p3_v3p3__1_2_2_2(ptr addrspace(1) inreg %ptr) { ; ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__1_2_2_2: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__1_2_2_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[4:5] op_sel:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3416,8 +3477,9 @@ define void @v_shuffle_v4p3_v3p3__4_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND @@ -3433,6 +3495,7 @@ define void @v_shuffle_v4p3_v3p3__4_2_2_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: ;;#ASMSTART @@ -3871,16 +3934,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_u_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[6:8] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v6 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3888,16 +3952,17 @@ define void @v_shuffle_v4p3_v3p3__5_5_u_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[6:8] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v6 -; GFX942-NEXT: v_mov_b32_e32 v1, v6 -; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v0, v8 +; GFX942-NEXT: v_mov_b32_e32 v1, v8 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3986,14 +4051,15 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_2(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v1, v6 ; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] @@ -4004,14 +4070,15 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_2(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v7, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[4:6] +; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[4:5] op_sel:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v1, v6 ; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] @@ -4108,8 +4175,9 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND @@ -4126,6 +4194,7 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_2(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: ;;#ASMSTART @@ -4200,35 +4269,39 @@ define void @v_shuffle_v4p3_v3p3__1_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__1_3_3_3: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__1_3_3_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v5 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4242,11 +4315,12 @@ define void @v_shuffle_v4p3_v3p3__2_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ; def v[2:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4254,11 +4328,12 @@ define void @v_shuffle_v4p3_v3p3__2_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4266,11 +4341,12 @@ define void @v_shuffle_v4p3_v3p3__2_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4395,36 +4471,39 @@ define void @v_shuffle_v4p3_v3p3__5_3_3_3(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4p3_v3p3__5_u_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_u_3_3: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_u_3_3: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v1, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_u_3_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v1, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND @@ -4432,7 +4511,7 @@ define void @v_shuffle_v4p3_v3p3__5_u_3_3(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: v_mov_b32_e32 v3, v4 -; GFX942-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -4726,43 +4805,47 @@ define void @v_shuffle_v4p3_v3p3__5_5_3_3(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4p3_v3p3__5_5_u_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_5_u_3: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ; def v[4:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_u_3: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_u_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5375,29 +5458,32 @@ define void @v_shuffle_v4p3_v3p3__5_u_4_4(ptr addrspace(1) inreg %ptr) { ; ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_u_4_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_u_4_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v2, v5 +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5684,40 +5770,45 @@ define void @v_shuffle_v4p3_v3p3__5_5_u_4(ptr addrspace(1) inreg %ptr) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ; def v[3:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_u_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ; def v[4:6] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_u_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[2:4] +; GFX942-NEXT: ; def v[4:6] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_mov_b32_e32 v1, v4 -; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v6 +; GFX942-NEXT: v_mov_b32_e32 v3, v5 +; GFX942-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -6083,8 +6174,9 @@ define void @v_shuffle_v4p3_v3p3__1_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] @@ -6101,6 +6193,7 @@ define void @v_shuffle_v4p3_v3p3__1_5_5_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] @@ -6237,29 +6330,31 @@ define void @v_shuffle_v4p3_v3p3__4_5_5_5(ptr addrspace(1) inreg %ptr) { ; ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__4_5_5_5: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__4_5_5_5: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:2] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[4:5] op_sel:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -6605,13 +6700,14 @@ define void @v_shuffle_v4p3_v3p3__5_5_u_5(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6619,13 +6715,14 @@ define void @v_shuffle_v4p3_v3p3__5_5_u_5(ptr addrspace(1) inreg %ptr) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def v[0:2] +; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: v_mov_b32_e32 v1, v2 -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: v_mov_b32_e32 v3, v4 +; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -6721,9 +6818,11 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 ; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -6739,9 +6838,11 @@ define void @v_shuffle_v4p3_v3p3__5_5_1_5(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 +; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v0, v4 -; GFX942-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] ; GFX942-NEXT: v_mov_b32_e32 v1, v4 ; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -6877,6 +6978,7 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_5(ptr addrspace(1) inreg %ptr) { ; ; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_4_5: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] @@ -6891,6 +6993,7 @@ define void @v_shuffle_v4p3_v3p3__5_5_4_5(ptr addrspace(1) inreg %ptr) { ; ; GFX942-LABEL: v_shuffle_v4p3_v3p3__5_5_4_5: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:4] @@ -6966,6 +7069,7 @@ define void @s_shuffle_v4p3_v3p3__0_u_u_u() { define void @s_shuffle_v4p3_v3p3__1_u_u_u() { ; GFX900-LABEL: s_shuffle_v4p3_v3p3__1_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -6978,6 +7082,7 @@ define void @s_shuffle_v4p3_v3p3__1_u_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4p3_v3p3__1_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -6990,6 +7095,7 @@ define void @s_shuffle_v4p3_v3p3__1_u_u_u() { ; ; GFX942-LABEL: s_shuffle_v4p3_v3p3__1_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -7008,6 +7114,7 @@ define void @s_shuffle_v4p3_v3p3__1_u_u_u() { define void @s_shuffle_v4p3_v3p3__2_u_u_u() { ; GFX900-LABEL: s_shuffle_v4p3_v3p3__2_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -7020,6 +7127,7 @@ define void @s_shuffle_v4p3_v3p3__2_u_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4p3_v3p3__2_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -7032,6 +7140,7 @@ define void @s_shuffle_v4p3_v3p3__2_u_u_u() { ; ; GFX942-LABEL: s_shuffle_v4p3_v3p3__2_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -7064,6 +7173,7 @@ define void @s_shuffle_v4p3_v3p3__3_u_u_u() { define void @s_shuffle_v4p3_v3p3__4_u_u_u() { ; GFX900-LABEL: s_shuffle_v4p3_v3p3__4_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -7076,6 +7186,7 @@ define void @s_shuffle_v4p3_v3p3__4_u_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4p3_v3p3__4_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -7088,6 +7199,7 @@ define void @s_shuffle_v4p3_v3p3__4_u_u_u() { ; ; GFX942-LABEL: s_shuffle_v4p3_v3p3__4_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -7107,6 +7219,7 @@ define void @s_shuffle_v4p3_v3p3__4_u_u_u() { define void @s_shuffle_v4p3_v3p3__5_u_u_u() { ; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -7119,6 +7232,7 @@ define void @s_shuffle_v4p3_v3p3__5_u_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -7131,6 +7245,7 @@ define void @s_shuffle_v4p3_v3p3__5_u_u_u() { ; ; GFX942-LABEL: s_shuffle_v4p3_v3p3__5_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -7150,14 +7265,15 @@ define void @s_shuffle_v4p3_v3p3__5_u_u_u() { define void @s_shuffle_v4p3_v3p3__5_0_u_u() { ; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_0_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ; def s[12:14] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s8, s14 ; GFX900-NEXT: s_mov_b32 s9, s4 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] @@ -7166,14 +7282,15 @@ define void @s_shuffle_v4p3_v3p3__5_0_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_0_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ; def s[12:14] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s8, s14 ; GFX90A-NEXT: s_mov_b32 s9, s4 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] @@ -7182,6 +7299,7 @@ define void @s_shuffle_v4p3_v3p3__5_0_u_u() { ; ; GFX942-LABEL: s_shuffle_v4p3_v3p3__5_0_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -7257,14 +7375,15 @@ define void @s_shuffle_v4p3_v3p3__5_1_u_u() { define void @s_shuffle_v4p3_v3p3__5_2_u_u() { ; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_2_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ; def s[12:14] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s8, s14 ; GFX900-NEXT: s_mov_b32 s9, s6 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] @@ -7273,14 +7392,15 @@ define void @s_shuffle_v4p3_v3p3__5_2_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_2_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ; def s[12:14] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s8, s14 ; GFX90A-NEXT: s_mov_b32 s9, s6 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] @@ -7289,6 +7409,7 @@ define void @s_shuffle_v4p3_v3p3__5_2_u_u() { ; ; GFX942-LABEL: s_shuffle_v4p3_v3p3__5_2_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -7312,6 +7433,7 @@ define void @s_shuffle_v4p3_v3p3__5_2_u_u() { define void @s_shuffle_v4p3_v3p3__5_3_u_u() { ; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_3_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -7325,6 +7447,7 @@ define void @s_shuffle_v4p3_v3p3__5_3_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_3_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -7338,6 +7461,7 @@ define void @s_shuffle_v4p3_v3p3__5_3_u_u() { ; ; GFX942-LABEL: s_shuffle_v4p3_v3p3__5_3_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -7377,6 +7501,7 @@ define void @s_shuffle_v4p3_v3p3__5_4_u_u() { define void @s_shuffle_v4p3_v3p3__5_5_u_u() { ; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_5_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -7390,6 +7515,7 @@ define void @s_shuffle_v4p3_v3p3__5_5_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_5_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -7403,6 +7529,7 @@ define void @s_shuffle_v4p3_v3p3__5_5_u_u() { ; ; GFX942-LABEL: s_shuffle_v4p3_v3p3__5_5_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -7423,15 +7550,16 @@ define void @s_shuffle_v4p3_v3p3__5_5_u_u() { define void @s_shuffle_v4p3_v3p3__5_5_0_u() { ; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_5_0_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ; def s[12:14] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s14 ; GFX900-NEXT: s_mov_b32 s10, s4 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] @@ -7440,15 +7568,16 @@ define void @s_shuffle_v4p3_v3p3__5_5_0_u() { ; ; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_5_0_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ; def s[12:14] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s14 ; GFX90A-NEXT: s_mov_b32 s10, s4 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] @@ -7457,6 +7586,7 @@ define void @s_shuffle_v4p3_v3p3__5_5_0_u() { ; ; GFX942-LABEL: s_shuffle_v4p3_v3p3__5_5_0_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -7481,15 +7611,16 @@ define void @s_shuffle_v4p3_v3p3__5_5_0_u() { define void @s_shuffle_v4p3_v3p3__5_5_1_u() { ; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_5_1_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ; def s[12:14] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s14 ; GFX900-NEXT: s_mov_b32 s10, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] @@ -7498,15 +7629,16 @@ define void @s_shuffle_v4p3_v3p3__5_5_1_u() { ; ; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_5_1_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ; def s[12:14] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s14 ; GFX90A-NEXT: s_mov_b32 s10, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] @@ -7515,6 +7647,7 @@ define void @s_shuffle_v4p3_v3p3__5_5_1_u() { ; ; GFX942-LABEL: s_shuffle_v4p3_v3p3__5_5_1_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -7594,6 +7727,7 @@ define void @s_shuffle_v4p3_v3p3__5_5_2_u() { define void @s_shuffle_v4p3_v3p3__5_5_3_u() { ; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_5_3_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -7608,6 +7742,7 @@ define void @s_shuffle_v4p3_v3p3__5_5_3_u() { ; ; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_5_3_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -7622,6 +7757,7 @@ define void @s_shuffle_v4p3_v3p3__5_5_3_u() { ; ; GFX942-LABEL: s_shuffle_v4p3_v3p3__5_5_3_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -7643,6 +7779,7 @@ define void @s_shuffle_v4p3_v3p3__5_5_3_u() { define void @s_shuffle_v4p3_v3p3__5_5_4_u() { ; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_5_4_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -7657,6 +7794,7 @@ define void @s_shuffle_v4p3_v3p3__5_5_4_u() { ; ; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_5_4_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -7671,6 +7809,7 @@ define void @s_shuffle_v4p3_v3p3__5_5_4_u() { ; ; GFX942-LABEL: s_shuffle_v4p3_v3p3__5_5_4_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -8011,6 +8150,7 @@ define void @s_shuffle_v4p3_v3p3__5_5_5_5() { define void @s_shuffle_v4p3_v3p3__u_0_0_0() { ; GFX900-LABEL: s_shuffle_v4p3_v3p3__u_0_0_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -8025,6 +8165,7 @@ define void @s_shuffle_v4p3_v3p3__u_0_0_0() { ; ; GFX90A-LABEL: s_shuffle_v4p3_v3p3__u_0_0_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -8039,6 +8180,7 @@ define void @s_shuffle_v4p3_v3p3__u_0_0_0() { ; ; GFX942-LABEL: s_shuffle_v4p3_v3p3__u_0_0_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -8181,6 +8323,7 @@ define void @s_shuffle_v4p3_v3p3__2_0_0_0() { define void @s_shuffle_v4p3_v3p3__3_0_0_0() { ; GFX900-LABEL: s_shuffle_v4p3_v3p3__3_0_0_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -8195,6 +8338,7 @@ define void @s_shuffle_v4p3_v3p3__3_0_0_0() { ; ; GFX90A-LABEL: s_shuffle_v4p3_v3p3__3_0_0_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -8209,6 +8353,7 @@ define void @s_shuffle_v4p3_v3p3__3_0_0_0() { ; ; GFX942-LABEL: s_shuffle_v4p3_v3p3__3_0_0_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -8351,14 +8496,15 @@ define void @s_shuffle_v4p3_v3p3__5_0_0_0() { define void @s_shuffle_v4p3_v3p3__5_u_0_0() { ; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_u_0_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ; def s[12:14] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s8, s14 ; GFX900-NEXT: s_mov_b32 s10, s4 ; GFX900-NEXT: s_mov_b32 s11, s4 ; GFX900-NEXT: ;;#ASMSTART @@ -8368,14 +8514,15 @@ define void @s_shuffle_v4p3_v3p3__5_u_0_0() { ; ; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_u_0_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ; def s[12:14] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s8, s14 ; GFX90A-NEXT: s_mov_b32 s10, s4 ; GFX90A-NEXT: s_mov_b32 s11, s4 ; GFX90A-NEXT: ;;#ASMSTART @@ -8385,6 +8532,7 @@ define void @s_shuffle_v4p3_v3p3__5_u_0_0() { ; ; GFX942-LABEL: s_shuffle_v4p3_v3p3__5_u_0_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -8711,15 +8859,16 @@ define void @s_shuffle_v4p3_v3p3__5_5_0_0() { define void @s_shuffle_v4p3_v3p3__5_5_u_0() { ; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_5_u_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ; def s[12:14] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s14 ; GFX900-NEXT: s_mov_b32 s11, s4 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] @@ -8728,15 +8877,16 @@ define void @s_shuffle_v4p3_v3p3__5_5_u_0() { ; ; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_5_u_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ; def s[12:14] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s14 ; GFX90A-NEXT: s_mov_b32 s11, s4 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] @@ -8745,6 +8895,7 @@ define void @s_shuffle_v4p3_v3p3__5_5_u_0() { ; ; GFX942-LABEL: s_shuffle_v4p3_v3p3__5_5_u_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -9226,14 +9377,15 @@ define void @s_shuffle_v4p3_v3p3__5_1_1_1() { define void @s_shuffle_v4p3_v3p3__5_u_1_1() { ; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_u_1_1: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ; def s[12:14] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s8, s14 ; GFX900-NEXT: s_mov_b32 s10, s5 ; GFX900-NEXT: s_mov_b32 s11, s5 ; GFX900-NEXT: ;;#ASMSTART @@ -9243,14 +9395,15 @@ define void @s_shuffle_v4p3_v3p3__5_u_1_1() { ; ; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_u_1_1: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ; def s[12:14] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s8, s14 ; GFX90A-NEXT: s_mov_b32 s10, s5 ; GFX90A-NEXT: s_mov_b32 s11, s5 ; GFX90A-NEXT: ;;#ASMSTART @@ -9260,6 +9413,7 @@ define void @s_shuffle_v4p3_v3p3__5_u_1_1() { ; ; GFX942-LABEL: s_shuffle_v4p3_v3p3__5_u_1_1: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -9586,15 +9740,16 @@ define void @s_shuffle_v4p3_v3p3__5_5_1_1() { define void @s_shuffle_v4p3_v3p3__5_5_u_1() { ; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_5_u_1: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ; def s[12:14] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s14 ; GFX900-NEXT: s_mov_b32 s11, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] @@ -9603,15 +9758,16 @@ define void @s_shuffle_v4p3_v3p3__5_5_u_1() { ; ; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_5_u_1: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ; def s[12:14] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s14 ; GFX90A-NEXT: s_mov_b32 s11, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] @@ -9620,6 +9776,7 @@ define void @s_shuffle_v4p3_v3p3__5_5_u_1() { ; ; GFX942-LABEL: s_shuffle_v4p3_v3p3__5_5_u_1: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -10446,15 +10603,16 @@ define void @s_shuffle_v4p3_v3p3__5_5_2_2() { define void @s_shuffle_v4p3_v3p3__5_5_u_2() { ; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_5_u_2: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ; def s[4:6] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ; def s[12:14] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s10 -; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s14 ; GFX900-NEXT: s_mov_b32 s11, s6 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] @@ -10463,15 +10621,16 @@ define void @s_shuffle_v4p3_v3p3__5_5_u_2() { ; ; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_5_u_2: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ; def s[4:6] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ; def s[12:14] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s10 -; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s14 ; GFX90A-NEXT: s_mov_b32 s11, s6 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] @@ -10480,6 +10639,7 @@ define void @s_shuffle_v4p3_v3p3__5_5_u_2() { ; ; GFX942-LABEL: s_shuffle_v4p3_v3p3__5_5_u_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -10802,6 +10962,7 @@ define void @s_shuffle_v4p3_v3p3__0_3_3_3() { define void @s_shuffle_v4p3_v3p3__1_3_3_3() { ; GFX900-LABEL: s_shuffle_v4p3_v3p3__1_3_3_3: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -10814,6 +10975,7 @@ define void @s_shuffle_v4p3_v3p3__1_3_3_3() { ; ; GFX90A-LABEL: s_shuffle_v4p3_v3p3__1_3_3_3: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -10826,6 +10988,7 @@ define void @s_shuffle_v4p3_v3p3__1_3_3_3() { ; ; GFX942-LABEL: s_shuffle_v4p3_v3p3__1_3_3_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -10844,6 +11007,7 @@ define void @s_shuffle_v4p3_v3p3__1_3_3_3() { define void @s_shuffle_v4p3_v3p3__2_3_3_3() { ; GFX900-LABEL: s_shuffle_v4p3_v3p3__2_3_3_3: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -10856,6 +11020,7 @@ define void @s_shuffle_v4p3_v3p3__2_3_3_3() { ; ; GFX90A-LABEL: s_shuffle_v4p3_v3p3__2_3_3_3: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -10868,6 +11033,7 @@ define void @s_shuffle_v4p3_v3p3__2_3_3_3() { ; ; GFX942-LABEL: s_shuffle_v4p3_v3p3__2_3_3_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -11004,6 +11170,7 @@ define void @s_shuffle_v4p3_v3p3__5_3_3_3() { define void @s_shuffle_v4p3_v3p3__5_u_3_3() { ; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_u_3_3: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -11018,6 +11185,7 @@ define void @s_shuffle_v4p3_v3p3__5_u_3_3() { ; ; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_u_3_3: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -11032,6 +11200,7 @@ define void @s_shuffle_v4p3_v3p3__5_u_3_3() { ; ; GFX942-LABEL: s_shuffle_v4p3_v3p3__5_u_3_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -11337,6 +11506,7 @@ define void @s_shuffle_v4p3_v3p3__5_5_3_3() { define void @s_shuffle_v4p3_v3p3__5_5_u_3() { ; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_5_u_3: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -11351,6 +11521,7 @@ define void @s_shuffle_v4p3_v3p3__5_5_u_3() { ; ; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_5_u_3: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -11365,6 +11536,7 @@ define void @s_shuffle_v4p3_v3p3__5_5_u_3() { ; ; GFX942-LABEL: s_shuffle_v4p3_v3p3__5_5_u_3: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -11874,6 +12046,7 @@ define void @s_shuffle_v4p3_v3p3__5_4_4_4() { define void @s_shuffle_v4p3_v3p3__5_u_4_4() { ; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_u_4_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -11888,6 +12061,7 @@ define void @s_shuffle_v4p3_v3p3__5_u_4_4() { ; ; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_u_4_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -11902,6 +12076,7 @@ define void @s_shuffle_v4p3_v3p3__5_u_4_4() { ; ; GFX942-LABEL: s_shuffle_v4p3_v3p3__5_u_4_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -12207,6 +12382,7 @@ define void @s_shuffle_v4p3_v3p3__5_5_4_4() { define void @s_shuffle_v4p3_v3p3__5_5_u_4() { ; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_5_u_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -12221,6 +12397,7 @@ define void @s_shuffle_v4p3_v3p3__5_5_u_4() { ; ; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_5_u_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -12235,6 +12412,7 @@ define void @s_shuffle_v4p3_v3p3__5_5_u_4() { ; ; GFX942-LABEL: s_shuffle_v4p3_v3p3__5_5_u_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] @@ -12989,6 +13167,7 @@ define void @s_shuffle_v4p3_v3p3__5_4_5_5() { define void @s_shuffle_v4p3_v3p3__5_5_u_5() { ; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_5_u_5: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:6] @@ -13003,6 +13182,7 @@ define void @s_shuffle_v4p3_v3p3__5_5_u_5() { ; ; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_5_u_5: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:6] @@ -13017,6 +13197,7 @@ define void @s_shuffle_v4p3_v3p3__5_5_u_5() { ; ; GFX942-LABEL: s_shuffle_v4p3_v3p3__5_5_u_5: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:2] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v4p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v4p3.ll index eeab42ae40d7f..c8ceae975e063 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v4p3.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v4p3.ll @@ -61,9 +61,10 @@ define void @v_shuffle_v4p3_v4p3__1_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -73,9 +74,10 @@ define void @v_shuffle_v4p3_v4p3__1_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -85,9 +87,10 @@ define void @v_shuffle_v4p3_v4p3__1_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -103,33 +106,37 @@ define void @v_shuffle_v4p3_v4p3__2_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__2_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__2_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -141,37 +148,41 @@ define void @v_shuffle_v4p3_v4p3__2_u_u_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4p3_v4p3__3_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__3_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__3_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__3_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -198,9 +209,10 @@ define void @v_shuffle_v4p3_v4p3__5_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -210,9 +222,10 @@ define void @v_shuffle_v4p3_v4p3__5_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -222,9 +235,10 @@ define void @v_shuffle_v4p3_v4p3__5_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -241,33 +255,37 @@ define void @v_shuffle_v4p3_v4p3__6_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__6_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__6_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -280,37 +298,41 @@ define void @v_shuffle_v4p3_v4p3__6_u_u_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4p3_v4p3__7_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -327,13 +349,14 @@ define void @v_shuffle_v4p3_v4p3__7_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr5_vgpr6_vgpr7_vgpr8 +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx4 v9, v[5:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -434,13 +457,14 @@ define void @v_shuffle_v4p3_v4p3__7_2_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ; implicit-def: $vgpr7_vgpr8_vgpr9_vgpr10 +; GFX900-NEXT: v_mov_b32_e32 v11, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v7, v6 +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: global_store_dwordx4 v11, v[7:10], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -484,49 +508,53 @@ define void @v_shuffle_v4p3_v4p3__7_2_u_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4p3_v4p3__7_3_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_3_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr8_vgpr9_vgpr10_vgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v7 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_3_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr8_vgpr9_vgpr10_vgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v7 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_3_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr8_vgpr9_vgpr10_vgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v7 +; GFX942-NEXT: v_mov_b32_e32 v9, v3 +; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -539,14 +567,15 @@ define void @v_shuffle_v4p3_v4p3__7_3_u_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4p3_v4p3__7_4_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_4_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: global_store_dwordx4 v4, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -626,14 +655,15 @@ define void @v_shuffle_v4p3_v4p3__7_5_u_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4p3_v4p3__7_6_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_6_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -670,40 +700,44 @@ define void @v_shuffle_v4p3_v4p3__7_6_u_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4p3_v4p3__7_7_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -720,14 +754,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_0_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr5_vgpr6_vgpr7_vgpr8 +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v0 +; GFX900-NEXT: global_store_dwordx4 v9, v[5:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -737,14 +772,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_0_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v5 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -754,15 +790,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_0_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v5 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -779,14 +815,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_1_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: v_mov_b32_e32 v8, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -796,14 +833,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_1_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v5 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -813,15 +851,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_1_u(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v5 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -890,52 +928,56 @@ define void @v_shuffle_v4p3_v4p3__7_7_2_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4p3_v4p3__7_7_3_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_3_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr8_vgpr9_vgpr10_vgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[4:7] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-NEXT: v_mov_b32_e32 v1, v7 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v8, v7 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: v_mov_b32_e32 v10, v3 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_3_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr8_vgpr9_vgpr10_vgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v7 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: v_mov_b32_e32 v10, v3 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_3_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr8_vgpr9_vgpr10_vgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v7 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: v_mov_b32_e32 v10, v3 +; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -948,41 +990,47 @@ define void @v_shuffle_v4p3_v4p3__7_7_3_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4p3_v4p3__7_7_4_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_4_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx4 v4, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_4_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_4_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -995,40 +1043,47 @@ define void @v_shuffle_v4p3_v4p3__7_7_4_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4p3_v4p3__7_7_5_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_5_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v4, v1 -; GFX900-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_5_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v1 -; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_5_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1087,43 +1142,47 @@ define void @v_shuffle_v4p3_v4p3__7_7_6_u(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4p3_v4p3__7_7_7_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_7_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_7_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_7_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_mov_b32_e32 v6, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1578,11 +1637,12 @@ define void @v_shuffle_v4p3_v4p3__u_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1592,11 +1652,12 @@ define void @v_shuffle_v4p3_v4p3__u_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1605,12 +1666,13 @@ define void @v_shuffle_v4p3_v4p3__u_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] -; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1820,11 +1882,12 @@ define void @v_shuffle_v4p3_v4p3__4_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -1834,11 +1897,12 @@ define void @v_shuffle_v4p3_v4p3__4_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1848,11 +1912,12 @@ define void @v_shuffle_v4p3_v4p3__4_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v4, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, v0 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2050,14 +2115,15 @@ define void @v_shuffle_v4p3_v4p3__7_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr5_vgpr6_vgpr7_vgpr8 +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v0 +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: global_store_dwordx4 v9, v[5:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2067,14 +2133,15 @@ define void @v_shuffle_v4p3_v4p3__7_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v5 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2084,15 +2151,15 @@ define void @v_shuffle_v4p3_v4p3__7_u_0_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2529,14 +2596,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_u_0(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr5_vgpr6_vgpr7_vgpr8 +; GFX900-NEXT: v_mov_b32_e32 v9, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[1:4] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: global_store_dwordx4 v9, v[5:8], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2546,14 +2614,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_u_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v5 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2563,15 +2632,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_u_0(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v9, v0 +; GFX942-NEXT: v_mov_b32_e32 v6, v5 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3400,14 +3469,15 @@ define void @v_shuffle_v4p3_v4p3__7_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v5 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: v_mov_b32_e32 v8, v1 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3417,14 +3487,15 @@ define void @v_shuffle_v4p3_v4p3__7_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v5 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3434,15 +3505,15 @@ define void @v_shuffle_v4p3_v4p3__7_u_1_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v5 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v8, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v5 +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3877,14 +3948,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_u_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9 +; GFX900-NEXT: v_mov_b32_e32 v10, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[2:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v5 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -3894,14 +3966,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_u_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v6, v5 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3911,15 +3984,15 @@ define void @v_shuffle_v4p3_v4p3__7_7_u_1(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: ; implicit-def: $vgpr6_vgpr7_vgpr8_vgpr9 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[2:5] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v5 -; GFX942-NEXT: v_mov_b32_e32 v3, v5 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v9, v1 +; GFX942-NEXT: v_mov_b32_e32 v6, v5 +; GFX942-NEXT: v_mov_b32_e32 v7, v5 +; GFX942-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -5196,48 +5269,52 @@ define void @v_shuffle_v4p3_v4p3__7_7_u_2(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ; implicit-def: $vgpr7_vgpr8_vgpr9_vgpr10 +; GFX900-NEXT: v_mov_b32_e32 v11, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[3:6] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v7, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v7, v6 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v10, v2 +; GFX900-NEXT: global_store_dwordx4 v11, v[7:10], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_u_2: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr8_vgpr9_vgpr10_vgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v7 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v7 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: v_mov_b32_e32 v11, v2 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_u_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr8_vgpr9_vgpr10_vgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v12, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[4:7] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: v_mov_b32_e32 v0, v7 -; GFX942-NEXT: v_mov_b32_e32 v1, v7 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v8, v7 +; GFX942-NEXT: v_mov_b32_e32 v9, v7 +; GFX942-NEXT: v_mov_b32_e32 v11, v2 +; GFX942-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -6974,9 +7051,10 @@ define void @v_shuffle_v4p3_v4p3__1_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -6986,9 +7064,10 @@ define void @v_shuffle_v4p3_v4p3__1_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6998,9 +7077,10 @@ define void @v_shuffle_v4p3_v4p3__1_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, v1 +; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -7016,33 +7096,37 @@ define void @v_shuffle_v4p3_v4p3__2_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: ; implicit-def: $vgpr3_vgpr4_vgpr5_vgpr6 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__2_4_4_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__2_4_4_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v2 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -7054,37 +7138,41 @@ define void @v_shuffle_v4p3_v4p3__2_4_4_4(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4p3_v4p3__3_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__3_4_4_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__3_4_4_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__3_4_4_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -7256,43 +7344,47 @@ define void @v_shuffle_v4p3_v4p3__7_4_4_4(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4p3_v4p3__7_u_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_u_4_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v0 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_u_4_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_u_4_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v6, v0 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -7694,41 +7786,47 @@ define void @v_shuffle_v4p3_v4p3__7_7_4_4(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4p3_v4p3__7_7_u_4(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_u_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v4, v0 -; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v7, v0 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_u_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v0 -; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_u_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -8548,43 +8646,47 @@ define void @v_shuffle_v4p3_v4p3__7_5_5_5(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4p3_v4p3__7_u_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_u_5_5: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_u_5_5: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_u_5_5: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v6, v1 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -8980,40 +9082,47 @@ define void @v_shuffle_v4p3_v4p3__7_7_5_5(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4p3_v4p3__7_7_u_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_u_5: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v3 -; GFX900-NEXT: v_mov_b32_e32 v5, v1 -; GFX900-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_u_5: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_u_5: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v3 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v1 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -10248,43 +10357,47 @@ define void @v_shuffle_v4p3_v4p3__7_7_6_6(ptr addrspace(1) inreg %ptr) { define void @v_shuffle_v4p3_v4p3__7_7_u_6(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_u_6: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:3] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v4, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-NEXT: v_mov_b32_e32 v3, v2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v7, v2 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_u_6: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: v_shuffle_v4p3_v4p3__7_7_u_6: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def v[0:3] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_mov_b32_e32 v0, v3 -; GFX942-NEXT: v_mov_b32_e32 v1, v3 -; GFX942-NEXT: v_mov_b32_e32 v3, v2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mov_b32_e32 v4, v3 +; GFX942-NEXT: v_mov_b32_e32 v5, v3 +; GFX942-NEXT: v_mov_b32_e32 v7, v2 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -11967,6 +12080,7 @@ define void @s_shuffle_v4p3_v4p3__0_u_u_u() { define void @s_shuffle_v4p3_v4p3__1_u_u_u() { ; GFX900-LABEL: s_shuffle_v4p3_v4p3__1_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -11979,6 +12093,7 @@ define void @s_shuffle_v4p3_v4p3__1_u_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4p3_v4p3__1_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -11991,6 +12106,7 @@ define void @s_shuffle_v4p3_v4p3__1_u_u_u() { ; ; GFX942-LABEL: s_shuffle_v4p3_v4p3__1_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -12009,6 +12125,7 @@ define void @s_shuffle_v4p3_v4p3__1_u_u_u() { define void @s_shuffle_v4p3_v4p3__2_u_u_u() { ; GFX900-LABEL: s_shuffle_v4p3_v4p3__2_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -12021,6 +12138,7 @@ define void @s_shuffle_v4p3_v4p3__2_u_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4p3_v4p3__2_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -12033,6 +12151,7 @@ define void @s_shuffle_v4p3_v4p3__2_u_u_u() { ; ; GFX942-LABEL: s_shuffle_v4p3_v4p3__2_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -12051,6 +12170,7 @@ define void @s_shuffle_v4p3_v4p3__2_u_u_u() { define void @s_shuffle_v4p3_v4p3__3_u_u_u() { ; GFX900-LABEL: s_shuffle_v4p3_v4p3__3_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -12063,6 +12183,7 @@ define void @s_shuffle_v4p3_v4p3__3_u_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4p3_v4p3__3_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -12075,6 +12196,7 @@ define void @s_shuffle_v4p3_v4p3__3_u_u_u() { ; ; GFX942-LABEL: s_shuffle_v4p3_v4p3__3_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -12107,6 +12229,7 @@ define void @s_shuffle_v4p3_v4p3__4_u_u_u() { define void @s_shuffle_v4p3_v4p3__5_u_u_u() { ; GFX900-LABEL: s_shuffle_v4p3_v4p3__5_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -12119,6 +12242,7 @@ define void @s_shuffle_v4p3_v4p3__5_u_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4p3_v4p3__5_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -12131,6 +12255,7 @@ define void @s_shuffle_v4p3_v4p3__5_u_u_u() { ; ; GFX942-LABEL: s_shuffle_v4p3_v4p3__5_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -12150,6 +12275,7 @@ define void @s_shuffle_v4p3_v4p3__5_u_u_u() { define void @s_shuffle_v4p3_v4p3__6_u_u_u() { ; GFX900-LABEL: s_shuffle_v4p3_v4p3__6_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -12162,6 +12288,7 @@ define void @s_shuffle_v4p3_v4p3__6_u_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4p3_v4p3__6_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -12174,6 +12301,7 @@ define void @s_shuffle_v4p3_v4p3__6_u_u_u() { ; ; GFX942-LABEL: s_shuffle_v4p3_v4p3__6_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -12193,6 +12321,7 @@ define void @s_shuffle_v4p3_v4p3__6_u_u_u() { define void @s_shuffle_v4p3_v4p3__7_u_u_u() { ; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_u_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -12205,6 +12334,7 @@ define void @s_shuffle_v4p3_v4p3__7_u_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_u_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -12217,6 +12347,7 @@ define void @s_shuffle_v4p3_v4p3__7_u_u_u() { ; ; GFX942-LABEL: s_shuffle_v4p3_v4p3__7_u_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -12236,14 +12367,15 @@ define void @s_shuffle_v4p3_v4p3__7_u_u_u() { define void @s_shuffle_v4p3_v4p3__7_0_u_u() { ; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_0_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ; def s[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ; def s[12:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s8, s15 ; GFX900-NEXT: s_mov_b32 s9, s4 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] @@ -12252,14 +12384,15 @@ define void @s_shuffle_v4p3_v4p3__7_0_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_0_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ; def s[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ; def s[12:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s8, s15 ; GFX90A-NEXT: s_mov_b32 s9, s4 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] @@ -12268,6 +12401,7 @@ define void @s_shuffle_v4p3_v4p3__7_0_u_u() { ; ; GFX942-LABEL: s_shuffle_v4p3_v4p3__7_0_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -12343,14 +12477,15 @@ define void @s_shuffle_v4p3_v4p3__7_1_u_u() { define void @s_shuffle_v4p3_v4p3__7_2_u_u() { ; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_2_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ; def s[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ; def s[12:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s8, s15 ; GFX900-NEXT: s_mov_b32 s9, s6 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] @@ -12359,14 +12494,15 @@ define void @s_shuffle_v4p3_v4p3__7_2_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_2_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ; def s[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ; def s[12:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s8, s15 ; GFX90A-NEXT: s_mov_b32 s9, s6 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] @@ -12375,6 +12511,7 @@ define void @s_shuffle_v4p3_v4p3__7_2_u_u() { ; ; GFX942-LABEL: s_shuffle_v4p3_v4p3__7_2_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -12398,14 +12535,15 @@ define void @s_shuffle_v4p3_v4p3__7_2_u_u() { define void @s_shuffle_v4p3_v4p3__7_3_u_u() { ; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_3_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ; def s[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ; def s[12:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s8, s15 ; GFX900-NEXT: s_mov_b32 s9, s7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] @@ -12414,14 +12552,15 @@ define void @s_shuffle_v4p3_v4p3__7_3_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_3_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ; def s[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ; def s[12:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s8, s15 ; GFX90A-NEXT: s_mov_b32 s9, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] @@ -12430,6 +12569,7 @@ define void @s_shuffle_v4p3_v4p3__7_3_u_u() { ; ; GFX942-LABEL: s_shuffle_v4p3_v4p3__7_3_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -12453,6 +12593,7 @@ define void @s_shuffle_v4p3_v4p3__7_3_u_u() { define void @s_shuffle_v4p3_v4p3__7_4_u_u() { ; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_4_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -12466,6 +12607,7 @@ define void @s_shuffle_v4p3_v4p3__7_4_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_4_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -12479,6 +12621,7 @@ define void @s_shuffle_v4p3_v4p3__7_4_u_u() { ; ; GFX942-LABEL: s_shuffle_v4p3_v4p3__7_4_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -12518,6 +12661,7 @@ define void @s_shuffle_v4p3_v4p3__7_5_u_u() { define void @s_shuffle_v4p3_v4p3__7_6_u_u() { ; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_6_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -12531,6 +12675,7 @@ define void @s_shuffle_v4p3_v4p3__7_6_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_6_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -12544,6 +12689,7 @@ define void @s_shuffle_v4p3_v4p3__7_6_u_u() { ; ; GFX942-LABEL: s_shuffle_v4p3_v4p3__7_6_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -12564,6 +12710,7 @@ define void @s_shuffle_v4p3_v4p3__7_6_u_u() { define void @s_shuffle_v4p3_v4p3__7_7_u_u() { ; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_u_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -12577,6 +12724,7 @@ define void @s_shuffle_v4p3_v4p3__7_7_u_u() { ; ; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_u_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -12590,6 +12738,7 @@ define void @s_shuffle_v4p3_v4p3__7_7_u_u() { ; ; GFX942-LABEL: s_shuffle_v4p3_v4p3__7_7_u_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -12610,15 +12759,16 @@ define void @s_shuffle_v4p3_v4p3__7_7_u_u() { define void @s_shuffle_v4p3_v4p3__7_7_0_u() { ; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_0_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ; def s[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ; def s[12:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s11 -; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s8, s15 +; GFX900-NEXT: s_mov_b32 s9, s15 ; GFX900-NEXT: s_mov_b32 s10, s4 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] @@ -12627,15 +12777,16 @@ define void @s_shuffle_v4p3_v4p3__7_7_0_u() { ; ; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_0_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ; def s[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ; def s[12:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s11 -; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s8, s15 +; GFX90A-NEXT: s_mov_b32 s9, s15 ; GFX90A-NEXT: s_mov_b32 s10, s4 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] @@ -12644,6 +12795,7 @@ define void @s_shuffle_v4p3_v4p3__7_7_0_u() { ; ; GFX942-LABEL: s_shuffle_v4p3_v4p3__7_7_0_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -12668,15 +12820,16 @@ define void @s_shuffle_v4p3_v4p3__7_7_0_u() { define void @s_shuffle_v4p3_v4p3__7_7_1_u() { ; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_1_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ; def s[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ; def s[12:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s11 -; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s8, s15 +; GFX900-NEXT: s_mov_b32 s9, s15 ; GFX900-NEXT: s_mov_b32 s10, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] @@ -12685,15 +12838,16 @@ define void @s_shuffle_v4p3_v4p3__7_7_1_u() { ; ; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_1_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ; def s[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ; def s[12:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s11 -; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s8, s15 +; GFX90A-NEXT: s_mov_b32 s9, s15 ; GFX90A-NEXT: s_mov_b32 s10, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] @@ -12702,6 +12856,7 @@ define void @s_shuffle_v4p3_v4p3__7_7_1_u() { ; ; GFX942-LABEL: s_shuffle_v4p3_v4p3__7_7_1_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -12781,15 +12936,16 @@ define void @s_shuffle_v4p3_v4p3__7_7_2_u() { define void @s_shuffle_v4p3_v4p3__7_7_3_u() { ; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_3_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ; def s[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ; def s[12:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s11 -; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s8, s15 +; GFX900-NEXT: s_mov_b32 s9, s15 ; GFX900-NEXT: s_mov_b32 s10, s7 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] @@ -12798,15 +12954,16 @@ define void @s_shuffle_v4p3_v4p3__7_7_3_u() { ; ; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_3_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ; def s[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ; def s[12:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s11 -; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s8, s15 +; GFX90A-NEXT: s_mov_b32 s9, s15 ; GFX90A-NEXT: s_mov_b32 s10, s7 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] @@ -12815,6 +12972,7 @@ define void @s_shuffle_v4p3_v4p3__7_7_3_u() { ; ; GFX942-LABEL: s_shuffle_v4p3_v4p3__7_7_3_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -12839,6 +12997,7 @@ define void @s_shuffle_v4p3_v4p3__7_7_3_u() { define void @s_shuffle_v4p3_v4p3__7_7_4_u() { ; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_4_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -12853,6 +13012,7 @@ define void @s_shuffle_v4p3_v4p3__7_7_4_u() { ; ; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_4_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -12867,6 +13027,7 @@ define void @s_shuffle_v4p3_v4p3__7_7_4_u() { ; ; GFX942-LABEL: s_shuffle_v4p3_v4p3__7_7_4_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -12888,6 +13049,7 @@ define void @s_shuffle_v4p3_v4p3__7_7_4_u() { define void @s_shuffle_v4p3_v4p3__7_7_5_u() { ; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_5_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -12902,6 +13064,7 @@ define void @s_shuffle_v4p3_v4p3__7_7_5_u() { ; ; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_5_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -12916,6 +13079,7 @@ define void @s_shuffle_v4p3_v4p3__7_7_5_u() { ; ; GFX942-LABEL: s_shuffle_v4p3_v4p3__7_7_5_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -12957,6 +13121,7 @@ define void @s_shuffle_v4p3_v4p3__7_7_6_u() { define void @s_shuffle_v4p3_v4p3__7_7_7_u() { ; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_7_u: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -12971,6 +13136,7 @@ define void @s_shuffle_v4p3_v4p3__7_7_7_u() { ; ; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_7_u: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -12985,6 +13151,7 @@ define void @s_shuffle_v4p3_v4p3__7_7_7_u() { ; ; GFX942-LABEL: s_shuffle_v4p3_v4p3__7_7_7_u: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -13424,6 +13591,7 @@ define void @s_shuffle_v4p3_v4p3__7_7_7_7() { define void @s_shuffle_v4p3_v4p3__u_0_0_0() { ; GFX900-LABEL: s_shuffle_v4p3_v4p3__u_0_0_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -13438,6 +13606,7 @@ define void @s_shuffle_v4p3_v4p3__u_0_0_0() { ; ; GFX90A-LABEL: s_shuffle_v4p3_v4p3__u_0_0_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -13452,6 +13621,7 @@ define void @s_shuffle_v4p3_v4p3__u_0_0_0() { ; ; GFX942-LABEL: s_shuffle_v4p3_v4p3__u_0_0_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -13645,6 +13815,7 @@ define void @s_shuffle_v4p3_v4p3__3_0_0_0() { define void @s_shuffle_v4p3_v4p3__4_0_0_0() { ; GFX900-LABEL: s_shuffle_v4p3_v4p3__4_0_0_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -13659,6 +13830,7 @@ define void @s_shuffle_v4p3_v4p3__4_0_0_0() { ; ; GFX90A-LABEL: s_shuffle_v4p3_v4p3__4_0_0_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -13673,6 +13845,7 @@ define void @s_shuffle_v4p3_v4p3__4_0_0_0() { ; ; GFX942-LABEL: s_shuffle_v4p3_v4p3__4_0_0_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -13876,14 +14049,15 @@ define void @s_shuffle_v4p3_v4p3__7_0_0_0() { define void @s_shuffle_v4p3_v4p3__7_u_0_0() { ; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_u_0_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ; def s[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ; def s[12:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s8, s15 ; GFX900-NEXT: s_mov_b32 s10, s4 ; GFX900-NEXT: s_mov_b32 s11, s4 ; GFX900-NEXT: ;;#ASMSTART @@ -13893,14 +14067,15 @@ define void @s_shuffle_v4p3_v4p3__7_u_0_0() { ; ; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_u_0_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ; def s[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ; def s[12:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s8, s15 ; GFX90A-NEXT: s_mov_b32 s10, s4 ; GFX90A-NEXT: s_mov_b32 s11, s4 ; GFX90A-NEXT: ;;#ASMSTART @@ -13910,6 +14085,7 @@ define void @s_shuffle_v4p3_v4p3__7_u_0_0() { ; ; GFX942-LABEL: s_shuffle_v4p3_v4p3__7_u_0_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -14358,15 +14534,16 @@ define void @s_shuffle_v4p3_v4p3__7_7_0_0() { define void @s_shuffle_v4p3_v4p3__7_7_u_0() { ; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_u_0: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ; def s[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ; def s[12:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s11 -; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s8, s15 +; GFX900-NEXT: s_mov_b32 s9, s15 ; GFX900-NEXT: s_mov_b32 s11, s4 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] @@ -14375,15 +14552,16 @@ define void @s_shuffle_v4p3_v4p3__7_7_u_0() { ; ; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_u_0: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ; def s[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ; def s[12:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s11 -; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s8, s15 +; GFX90A-NEXT: s_mov_b32 s9, s15 ; GFX90A-NEXT: s_mov_b32 s11, s4 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] @@ -14392,6 +14570,7 @@ define void @s_shuffle_v4p3_v4p3__7_7_u_0() { ; ; GFX942-LABEL: s_shuffle_v4p3_v4p3__7_7_u_0: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -15070,14 +15249,15 @@ define void @s_shuffle_v4p3_v4p3__7_1_1_1() { define void @s_shuffle_v4p3_v4p3__7_u_1_1() { ; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_u_1_1: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ; def s[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ; def s[12:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s8, s15 ; GFX900-NEXT: s_mov_b32 s10, s5 ; GFX900-NEXT: s_mov_b32 s11, s5 ; GFX900-NEXT: ;;#ASMSTART @@ -15087,14 +15267,15 @@ define void @s_shuffle_v4p3_v4p3__7_u_1_1() { ; ; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_u_1_1: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ; def s[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ; def s[12:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s8, s15 ; GFX90A-NEXT: s_mov_b32 s10, s5 ; GFX90A-NEXT: s_mov_b32 s11, s5 ; GFX90A-NEXT: ;;#ASMSTART @@ -15104,6 +15285,7 @@ define void @s_shuffle_v4p3_v4p3__7_u_1_1() { ; ; GFX942-LABEL: s_shuffle_v4p3_v4p3__7_u_1_1: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -15552,15 +15734,16 @@ define void @s_shuffle_v4p3_v4p3__7_7_1_1() { define void @s_shuffle_v4p3_v4p3__7_7_u_1() { ; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_u_1: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ; def s[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ; def s[12:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s11 -; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s8, s15 +; GFX900-NEXT: s_mov_b32 s9, s15 ; GFX900-NEXT: s_mov_b32 s11, s5 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] @@ -15569,15 +15752,16 @@ define void @s_shuffle_v4p3_v4p3__7_7_u_1() { ; ; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_u_1: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ; def s[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ; def s[12:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s11 -; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s8, s15 +; GFX90A-NEXT: s_mov_b32 s9, s15 ; GFX90A-NEXT: s_mov_b32 s11, s5 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] @@ -15586,6 +15770,7 @@ define void @s_shuffle_v4p3_v4p3__7_7_u_1() { ; ; GFX942-LABEL: s_shuffle_v4p3_v4p3__7_7_u_1: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -16725,15 +16910,16 @@ define void @s_shuffle_v4p3_v4p3__7_7_2_2() { define void @s_shuffle_v4p3_v4p3__7_7_u_2() { ; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_u_2: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ; def s[4:7] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ; def s[12:15] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: s_mov_b32 s8, s11 -; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s8, s15 +; GFX900-NEXT: s_mov_b32 s9, s15 ; GFX900-NEXT: s_mov_b32 s11, s6 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] @@ -16742,15 +16928,16 @@ define void @s_shuffle_v4p3_v4p3__7_7_u_2() { ; ; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_u_2: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ; def s[4:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ; def s[12:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: s_mov_b32 s8, s11 -; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s8, s15 +; GFX90A-NEXT: s_mov_b32 s9, s15 ; GFX90A-NEXT: s_mov_b32 s11, s6 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] @@ -16759,6 +16946,7 @@ define void @s_shuffle_v4p3_v4p3__7_7_u_2() { ; ; GFX942-LABEL: s_shuffle_v4p3_v4p3__7_7_u_2: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -18358,6 +18546,7 @@ define void @s_shuffle_v4p3_v4p3__0_4_4_4() { define void @s_shuffle_v4p3_v4p3__1_4_4_4() { ; GFX900-LABEL: s_shuffle_v4p3_v4p3__1_4_4_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -18370,6 +18559,7 @@ define void @s_shuffle_v4p3_v4p3__1_4_4_4() { ; ; GFX90A-LABEL: s_shuffle_v4p3_v4p3__1_4_4_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -18382,6 +18572,7 @@ define void @s_shuffle_v4p3_v4p3__1_4_4_4() { ; ; GFX942-LABEL: s_shuffle_v4p3_v4p3__1_4_4_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -18400,6 +18591,7 @@ define void @s_shuffle_v4p3_v4p3__1_4_4_4() { define void @s_shuffle_v4p3_v4p3__2_4_4_4() { ; GFX900-LABEL: s_shuffle_v4p3_v4p3__2_4_4_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -18412,6 +18604,7 @@ define void @s_shuffle_v4p3_v4p3__2_4_4_4() { ; ; GFX90A-LABEL: s_shuffle_v4p3_v4p3__2_4_4_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -18424,6 +18617,7 @@ define void @s_shuffle_v4p3_v4p3__2_4_4_4() { ; ; GFX942-LABEL: s_shuffle_v4p3_v4p3__2_4_4_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -18442,6 +18636,7 @@ define void @s_shuffle_v4p3_v4p3__2_4_4_4() { define void @s_shuffle_v4p3_v4p3__3_4_4_4() { ; GFX900-LABEL: s_shuffle_v4p3_v4p3__3_4_4_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -18454,6 +18649,7 @@ define void @s_shuffle_v4p3_v4p3__3_4_4_4() { ; ; GFX90A-LABEL: s_shuffle_v4p3_v4p3__3_4_4_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -18466,6 +18662,7 @@ define void @s_shuffle_v4p3_v4p3__3_4_4_4() { ; ; GFX942-LABEL: s_shuffle_v4p3_v4p3__3_4_4_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -18654,6 +18851,7 @@ define void @s_shuffle_v4p3_v4p3__7_4_4_4() { define void @s_shuffle_v4p3_v4p3__7_u_4_4() { ; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_u_4_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -18668,6 +18866,7 @@ define void @s_shuffle_v4p3_v4p3__7_u_4_4() { ; ; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_u_4_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -18682,6 +18881,7 @@ define void @s_shuffle_v4p3_v4p3__7_u_4_4() { ; ; GFX942-LABEL: s_shuffle_v4p3_v4p3__7_u_4_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -19100,6 +19300,7 @@ define void @s_shuffle_v4p3_v4p3__7_7_4_4() { define void @s_shuffle_v4p3_v4p3__7_7_u_4() { ; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_u_4: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -19114,6 +19315,7 @@ define void @s_shuffle_v4p3_v4p3__7_7_u_4() { ; ; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_u_4: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -19128,6 +19330,7 @@ define void @s_shuffle_v4p3_v4p3__7_7_u_4() { ; ; GFX942-LABEL: s_shuffle_v4p3_v4p3__7_7_u_4: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -19829,6 +20032,7 @@ define void @s_shuffle_v4p3_v4p3__7_5_5_5() { define void @s_shuffle_v4p3_v4p3__7_u_5_5() { ; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_u_5_5: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -19843,6 +20047,7 @@ define void @s_shuffle_v4p3_v4p3__7_u_5_5() { ; ; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_u_5_5: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -19857,6 +20062,7 @@ define void @s_shuffle_v4p3_v4p3__7_u_5_5() { ; ; GFX942-LABEL: s_shuffle_v4p3_v4p3__7_u_5_5: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -20275,6 +20481,7 @@ define void @s_shuffle_v4p3_v4p3__7_7_5_5() { define void @s_shuffle_v4p3_v4p3__7_7_u_5() { ; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_u_5: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -20289,6 +20496,7 @@ define void @s_shuffle_v4p3_v4p3__7_7_u_5() { ; ; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_u_5: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -20303,6 +20511,7 @@ define void @s_shuffle_v4p3_v4p3__7_7_u_5() { ; ; GFX942-LABEL: s_shuffle_v4p3_v4p3__7_7_u_5: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] @@ -21349,6 +21558,7 @@ define void @s_shuffle_v4p3_v4p3__7_7_6_6() { define void @s_shuffle_v4p3_v4p3__7_7_u_6() { ; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_u_6: ; GFX900: ; %bb.0: +; GFX900-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:7] @@ -21363,6 +21573,7 @@ define void @s_shuffle_v4p3_v4p3__7_7_u_6() { ; ; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_u_6: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:7] @@ -21377,6 +21588,7 @@ define void @s_shuffle_v4p3_v4p3__7_7_u_6() { ; ; GFX942-LABEL: s_shuffle_v4p3_v4p3__7_7_u_6: ; GFX942: ; %bb.0: +; GFX942-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:3] diff --git a/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll b/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll index 4621be5cab450..458da8244d3f2 100644 --- a/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll +++ b/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll @@ -21,11 +21,12 @@ define amdgpu_kernel void @foobar(float %a0, float %a1, ptr addrspace(1) %out) # ; CHECK-NEXT: v_mov_b32_e32 v3, s7 ; CHECK-NEXT: s_and_saveexec_b64 s[6:7], vcc ; CHECK-NEXT: ; %bb.1: ; %ift -; CHECK-NEXT: s_mov_b32 s4, s5 -; CHECK-NEXT: v_mov_b32_e32 v0, s4 -; CHECK-NEXT: v_mov_b32_e32 v1, s5 -; CHECK-NEXT: v_mov_b32_e32 v2, s6 -; CHECK-NEXT: v_mov_b32_e32 v3, s7 +; CHECK-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 +; CHECK-NEXT: s_mov_b32 s8, s5 +; CHECK-NEXT: v_mov_b32_e32 v0, s8 +; CHECK-NEXT: v_mov_b32_e32 v1, s9 +; CHECK-NEXT: v_mov_b32_e32 v2, s10 +; CHECK-NEXT: v_mov_b32_e32 v3, s11 ; CHECK-NEXT: ; %bb.2: ; %ife ; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb diff --git a/llvm/test/CodeGen/AMDGPU/uniform-select.ll b/llvm/test/CodeGen/AMDGPU/uniform-select.ll index f001bf0d5e498..6ab1e688a3684 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-select.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-select.ll @@ -8,6 +8,7 @@ define amdgpu_kernel void @test_insert_extract(i32 %p, i32 %q) { ; GFX90A-LABEL: test_insert_extract: ; GFX90A: ; %bb.0: ; %entry ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX90A-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GFX90A-NEXT: s_mov_b32 s2, 0 ; GFX90A-NEXT: s_and_b64 vcc, exec, -1 ; GFX90A-NEXT: s_mov_b32 s3, 0 @@ -56,6 +57,7 @@ define amdgpu_kernel void @test_insert_extract(i32 %p, i32 %q) { ; GFX942-LABEL: test_insert_extract: ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GFX942-NEXT: s_mov_b32 s2, 0 ; GFX942-NEXT: s_and_b64 vcc, exec, -1 ; GFX942-NEXT: s_mov_b32 s3, 0 @@ -110,6 +112,7 @@ define amdgpu_kernel void @test_insert_extract(i32 %p, i32 %q) { ; GFX1030-NEXT: s_mov_b32 s5, 0 ; GFX1030-NEXT: s_mov_b32 s6, 0 ; GFX1030-NEXT: s_mov_b32 vcc_lo, exec_lo +; GFX1030-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX1030-NEXT: .p2align 6 ; GFX1030-NEXT: .LBB0_1: ; %for.body ; GFX1030-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -158,6 +161,7 @@ define amdgpu_kernel void @test_insert_extract(i32 %p, i32 %q) { ; GFX1100-NEXT: s_mov_b32 s5, 0 ; GFX1100-NEXT: s_mov_b32 s6, 0 ; GFX1100-NEXT: s_mov_b32 vcc_lo, exec_lo +; GFX1100-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX1100-NEXT: .p2align 6 ; GFX1100-NEXT: .LBB0_1: ; %for.body ; GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll index 98919f565d902..4b4e2023dccc6 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll @@ -2475,7 +2475,9 @@ define i64 @test_vector_reduce_mul_v2i64(<2 x i64> %v) { ; GFX7-GISEL-NEXT: v_mov_b32_e32 v4, v0 ; GFX7-GISEL-NEXT: v_mov_b32_e32 v5, v1 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v3, v[1:2] +; GFX7-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX7-GISEL-NEXT: v_mov_b32_e32 v6, v1 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v3, v[6:7] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[3:4] ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -2496,7 +2498,9 @@ define i64 @test_vector_reduce_mul_v2i64(<2 x i64> %v) { ; GFX8-GISEL-NEXT: v_mov_b32_e32 v4, v0 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v3, v[1:2] +; GFX8-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v6, v1 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v3, v[6:7] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[3:4] ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -2534,8 +2538,10 @@ define i64 @test_vector_reduce_mul_v2i64(<2 x i64> %v) { ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v5, v1 +; GFX10-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v4, v2, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v4, v3, v[1:2] +; GFX10-GISEL-NEXT: v_mov_b32_e32 v6, v1 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v4, v3, v[6:7] ; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v5, v2, v[3:4] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -2555,11 +2561,13 @@ define i64 @test_vector_reduce_mul_v2i64(<2 x i64> %v) { ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v2 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v6, v1 +; GFX11-GISEL-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v4, v5, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v4, v3, v[1:2] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v6, v5, v[7:8] +; GFX11-GISEL-NEXT: v_mov_b32_e32 v7, v1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v4, v3, v[7:8] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v6, v5, v[9:10] ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-SDAG-LABEL: test_vector_reduce_mul_v2i64: @@ -2614,12 +2622,15 @@ define i64 @test_vector_reduce_mul_v3i64(<3 x i64> %v) { ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v0, v2, 0 +; GFX7-GISEL-NEXT: ; implicit-def: $vgpr9_vgpr10 ; GFX7-GISEL-NEXT: v_mov_b32_e32 v6, v1 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v8 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v3, v[1:2] +; GFX7-GISEL-NEXT: v_mov_b32_e32 v9, v8 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v3, v[9:10] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v4, 0 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v2, v[8:9] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v5, v[1:2] +; GFX7-GISEL-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GFX7-GISEL-NEXT: v_mov_b32_e32 v8, v1 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v5, v[8:9] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v2, v4, v[5:6] ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -2642,12 +2653,15 @@ define i64 @test_vector_reduce_mul_v3i64(<3 x i64> %v) { ; GFX8-GISEL: ; %bb.0: ; %entry ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v0, v2, 0 +; GFX8-GISEL-NEXT: ; implicit-def: $vgpr9_vgpr10 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v6, v1 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v8 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v3, v[1:2] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v9, v8 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v3, v[9:10] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v4, 0 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v2, v[8:9] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v5, v[1:2] +; GFX8-GISEL-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v8, v1 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v5, v[8:9] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v2, v4, v[5:6] ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -2694,13 +2708,17 @@ define i64 @test_vector_reduce_mul_v3i64(<3 x i64> %v) { ; GFX10-GISEL: ; %bb.0: ; %entry ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v6, v0 +; GFX10-GISEL-NEXT: ; implicit-def: $vgpr10_vgpr11 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v7, v1 ; GFX10-GISEL-NEXT: v_mad_u64_u32 v[8:9], s4, v6, v2, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], s4, v6, v3, v[9:10] +; GFX10-GISEL-NEXT: v_mov_b32_e32 v10, v9 ; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v8, v4, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v7, v2, v[9:10] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], s4, v8, v5, v[1:2] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v2, v4, v[5:6] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], s4, v6, v3, v[10:11] +; GFX10-GISEL-NEXT: ; implicit-def: $vgpr10_vgpr11 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v10, v1 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v7, v2, v[9:10] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v8, v5, v[10:11] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v1, v4, v[2:3] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: test_vector_reduce_mul_v3i64: @@ -2722,15 +2740,20 @@ define i64 @test_vector_reduce_mul_v3i64(<3 x i64> %v) { ; GFX11-GISEL: ; %bb.0: ; %entry ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v6, v0 :: v_dual_mov_b32 v7, v1 +; GFX11-GISEL-NEXT: ; implicit-def: $vgpr10_vgpr11 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v6, v2, 0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v10, v9 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v8, v4, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[12:13], null, v6, v3, v[10:11] +; GFX11-GISEL-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_mov_b32_e32 v9, v1 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[13:14], null, v7, v2, v[12:13] ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v6, v3, v[9:10] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[11:12], null, v7, v2, v[10:11] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v8, v5, v[1:2] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v11, v4, v[6:7] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v8, v5, v[9:10] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v13, v4, v[6:7] ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-SDAG-LABEL: test_vector_reduce_mul_v3i64: @@ -2801,17 +2824,21 @@ define i64 @test_vector_reduce_mul_v4i64(<4 x i64> %v) { ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v0, v4, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v2, v6, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v2, v6, 0 +; GFX7-GISEL-NEXT: ; implicit-def: $vgpr11_vgpr12 ; GFX7-GISEL-NEXT: v_mov_b32_e32 v8, v1 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v10 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v0, v5, v[1:2] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v2, v7, v[0:1] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v9, v11, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, v6, v[15:16] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v4, v[13:14] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v2, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v11, v[1:2] +; GFX7-GISEL-NEXT: v_mov_b32_e32 v11, v10 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v0, v5, v[11:12] +; GFX7-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v14 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v2, v7, v[0:1] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v9, v13, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, v6, v[14:15] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v4, v[10:11] +; GFX7-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX7-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v2, v[4:5] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v13, v[1:2] ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_vector_reduce_mul_v4i64: @@ -2838,17 +2865,21 @@ define i64 @test_vector_reduce_mul_v4i64(<4 x i64> %v) { ; GFX8-GISEL: ; %bb.0: ; %entry ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v0, v4, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v2, v6, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v2, v6, 0 +; GFX8-GISEL-NEXT: ; implicit-def: $vgpr11_vgpr12 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v8, v1 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v10 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v0, v5, v[1:2] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v2, v7, v[0:1] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v9, v11, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, v6, v[15:16] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v4, v[13:14] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v2, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v11, v[1:2] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v11, v10 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v0, v5, v[11:12] +; GFX8-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v14 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v2, v7, v[0:1] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v9, v13, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, v6, v[14:15] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v4, v[10:11] +; GFX8-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v2, v[4:5] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v13, v[1:2] ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: test_vector_reduce_mul_v4i64: @@ -2908,15 +2939,19 @@ define i64 @test_vector_reduce_mul_v4i64(<4 x i64> %v) { ; GFX10-GISEL-NEXT: v_mad_u64_u32 v[10:11], s4, v2, v6, 0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v8, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v9, v1 +; GFX10-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX10-GISEL-NEXT: ; implicit-def: $vgpr14_vgpr15 ; GFX10-GISEL-NEXT: v_mad_u64_u32 v[12:13], s4, v8, v4, 0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, v11 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[14:15], s4, v2, v7, v[0:1] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, v13 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[16:17], s4, v2, v7, v[0:1] +; GFX10-GISEL-NEXT: v_mov_b32_e32 v14, v13 ; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v12, v10, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], s4, v8, v5, v[2:3] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v3, v6, v[14:15] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], s4, v8, v5, v[14:15] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v3, v6, v[16:17] +; GFX10-GISEL-NEXT: ; implicit-def: $vgpr5_vgpr6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v9, v4, v[7:8] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v12, v2, v[1:2] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v12, v2, v[5:6] ; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v3, v10, v[1:2] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -2945,21 +2980,26 @@ define i64 @test_vector_reduce_mul_v4i64(<4 x i64> %v) { ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v8, v0 :: v_dual_mov_b32 v9, v1 ; GFX11-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v2, v6, 0 +; GFX11-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX11-GISEL-NEXT: ; implicit-def: $vgpr14_vgpr15 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-GISEL-NEXT: v_mad_u64_u32 v[12:13], null, v8, v4, 0 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, v11 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[14:15], null, v2, v7, v[0:1] -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, v13 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_mov_b32_e32 v14, v13 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[16:17], null, v2, v7, v[0:1] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v12, v10, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[15:16], null, v8, v5, v[2:3] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[17:18], null, v8, v5, v[14:15] ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v3, v6, v[14:15] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v9, v4, v[15:16] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v12, v7, v[1:2] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v5, v10, v[3:4] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v3, v6, v[16:17] +; GFX11-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v9, v4, v[17:18] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, v1 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v12, v7, v[2:3] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v5, v10, v[8:9] ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-SDAG-LABEL: test_vector_reduce_mul_v4i64: @@ -3060,30 +3100,39 @@ define i64 @test_vector_reduce_mul_v8i64(<8 x i64> %v) { ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v4, v12, 0 +; GFX7-GISEL-NEXT: ; implicit-def: $vgpr18_vgpr19 +; GFX7-GISEL-NEXT: v_mov_b32_e32 v18, v17 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v4, v13, v[18:19] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v0, v8, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v4, v13, v[17:18] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v12, v[17:18] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v19, v16, 0 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v5, v13 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v13, v20 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v0, v9, v[13:14] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v19, v4, v[5:6] +; GFX7-GISEL-NEXT: ; implicit-def: $vgpr17_vgpr18 +; GFX7-GISEL-NEXT: v_mov_b32_e32 v17, v13 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v19, v4, v[17:18] +; GFX7-GISEL-NEXT: ; implicit-def: $vgpr17_vgpr18 +; GFX7-GISEL-NEXT: v_mov_b32_e32 v17, v20 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v0, v9, v[17:18] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v6, v14, 0 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v8, v[17:18] +; GFX7-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v2, v10, 0 ; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v20 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v15, v[0:1] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v6, v18 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v11, v[6:7] +; GFX7-GISEL-NEXT: ; implicit-def: $vgpr20_vgpr21 +; GFX7-GISEL-NEXT: v_mov_b32_e32 v20, v18 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v11, v[20:21] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v14, v[0:1] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v17, v19, 0 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, v10, v[20:21] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v7 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v17, v0, v[1:2] +; GFX7-GISEL-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX7-GISEL-NEXT: v_mov_b32_e32 v9, v7 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v17, v0, v[9:10] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v6, 0 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v19, v[9:10] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v16, v[4:5] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v2, v[1:2] +; GFX7-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX7-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v2, v[4:5] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v6, v[1:2] ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -3131,30 +3180,39 @@ define i64 @test_vector_reduce_mul_v8i64(<8 x i64> %v) { ; GFX8-GISEL: ; %bb.0: ; %entry ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v4, v12, 0 +; GFX8-GISEL-NEXT: ; implicit-def: $vgpr18_vgpr19 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v18, v17 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v4, v13, v[18:19] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v0, v8, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v4, v13, v[17:18] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v12, v[17:18] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v19, v16, 0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, v13 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v13, v20 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v0, v9, v[13:14] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v19, v4, v[5:6] +; GFX8-GISEL-NEXT: ; implicit-def: $vgpr17_vgpr18 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v17, v13 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v19, v4, v[17:18] +; GFX8-GISEL-NEXT: ; implicit-def: $vgpr17_vgpr18 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v17, v20 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v0, v9, v[17:18] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v6, v14, 0 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v8, v[17:18] +; GFX8-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v2, v10, 0 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v20 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v15, v[0:1] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v6, v18 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v11, v[6:7] +; GFX8-GISEL-NEXT: ; implicit-def: $vgpr20_vgpr21 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v20, v18 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v11, v[20:21] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v14, v[0:1] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v17, v19, 0 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, v10, v[20:21] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v7 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v17, v0, v[1:2] +; GFX8-GISEL-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v9, v7 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v17, v0, v[9:10] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v6, 0 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v19, v[9:10] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v16, v[4:5] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v2, v[1:2] +; GFX8-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v2, v[4:5] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v6, v[1:2] ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -3262,32 +3320,40 @@ define i64 @test_vector_reduce_mul_v8i64(<8 x i64> %v) { ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mad_u64_u32 v[17:18], s4, v0, v8, 0 ; GFX10-GISEL-NEXT: v_mad_u64_u32 v[19:20], s4, v2, v10, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[23:24], s4, v6, v14, 0 +; GFX10-GISEL-NEXT: ; implicit-def: $vgpr21_vgpr22 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[25:26], s4, v4, v12, 0 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v21, v18 +; GFX10-GISEL-NEXT: ; implicit-def: $vgpr27_vgpr28 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v16, v1 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[21:22], s4, v6, v14, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[23:24], s4, v4, v12, 0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, v18 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v18, v20 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[25:26], s4, v0, v9, v[1:2] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v2, v11, v[18:19] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, v22 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, v24 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[29:30], s4, v4, v13, v[2:3] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[26:27], s4, v6, v15, v[0:1] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[27:28], s4, v19, v21, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[30:31], s4, v17, v23, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v3, v10, v[1:2] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[6:7], s4, v7, v14, v[26:27] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, v28 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v5, v12, v[29:30] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, v31 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], s4, v19, v6, v[0:1] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[6:7], s4, v16, v8, v[25:26] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v30, v27, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v17, v3, v[4:5] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], s4, v2, v21, v[5:6] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v6, v23, v[3:4] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v30, v4, v[1:2] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v2, v27, v[3:4] +; GFX10-GISEL-NEXT: v_mov_b32_e32 v27, v20 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[29:30], s4, v19, v23, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[20:21], s4, v0, v9, v[21:22] +; GFX10-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[21:22], s4, v2, v11, v[27:28] +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, v24 +; GFX10-GISEL-NEXT: ; implicit-def: $vgpr27_vgpr28 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v27, v26 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v6, v15, v[0:1] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v4, v13, v[27:28] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v3, v10, v[21:22] +; GFX10-GISEL-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[26:27], s4, v17, v25, 0 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, v30 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[6:7], s4, v7, v14, v[0:1] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], s4, v5, v12, v[1:2] +; GFX10-GISEL-NEXT: ; implicit-def: $vgpr10_vgpr11 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v26, v29, 0 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v10, v27 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v19, v6, v[3:4] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], s4, v16, v8, v[20:21] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], s4, v17, v9, v[10:11] +; GFX10-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v6, v1 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v2, v23, v[3:4] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v4, v25, v[5:6] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v26, v2, v[6:7] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v3, v29, v[1:2] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: test_vector_reduce_mul_v8i64: @@ -3333,37 +3399,47 @@ define i64 @test_vector_reduce_mul_v8i64(<8 x i64> %v) { ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-GISEL-NEXT: v_mad_u64_u32 v[17:18], null, v0, v8, 0 ; GFX11-GISEL-NEXT: v_mad_u64_u32 v[19:20], null, v2, v10, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[21:22], null, v6, v14, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[23:24], null, v4, v12, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v16, v1 :: v_dual_mov_b32 v1, v18 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v18, v20 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[25:26], null, v0, v9, v[1:2] -; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, v22 :: v_dual_mov_b32 v1, v24 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[26:27], null, v2, v11, v[18:19] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[27:28], null, v6, v15, v[0:1] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[30:31], null, v4, v13, v[1:2] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[28:29], null, v19, v21, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[31:32], null, v17, v23, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[23:24], null, v6, v14, 0 +; GFX11-GISEL-NEXT: ; implicit-def: $vgpr21_vgpr22 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v16, v1 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[25:26], null, v4, v12, 0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v21, v18 +; GFX11-GISEL-NEXT: ; implicit-def: $vgpr27_vgpr28 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v27, v20 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[29:30], null, v0, v9, v[21:22] +; GFX11-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[20:21], null, v2, v11, v[27:28] +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, v24 +; GFX11-GISEL-NEXT: ; implicit-def: $vgpr21_vgpr22 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v21, v26 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v7, v14, v[27:28] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v3, v10, v[26:27] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v5, v12, v[30:31] -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, v29 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, v32 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v16, v8, v[25:26] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v19, v0, v[1:2] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v31, v28, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v17, v2, v[3:4] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[26:27], null, v6, v15, v[0:1] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[27:28], null, v19, v23, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v4, v13, v[21:22] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[21:22], null, v17, v25, 0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v6, v21, v[4:5] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v9, v23, v[7:8] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v7, v14, v[26:27] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v3, v10, v[20:21] +; GFX11-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, v28 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v5, v12, v[0:1] +; GFX11-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, v22 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v19, v1, v[2:3] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v21, v27, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v16, v8, v[29:30] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v17, v9, v[4:5] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v6, v23, v[10:11] +; GFX11-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v2, v25, v[7:8] ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v31, v2, v[1:2] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v3, v28, v[4:5] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v21, v3, v[4:5] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v8, v27, v[6:7] ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-SDAG-LABEL: test_vector_reduce_mul_v8i64: @@ -3542,63 +3618,82 @@ define i64 @test_vector_reduce_mul_v16i64(<16 x i64> %v) { ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[31:32], s[4:5], v8, v24, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v8, v25, v[32:33] +; GFX7-GISEL-NEXT: ; implicit-def: $vgpr33_vgpr34 +; GFX7-GISEL-NEXT: v_mov_b32_e32 v33, v32 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v8, v25, v[33:34] +; GFX7-GISEL-NEXT: ; implicit-def: $vgpr34_vgpr35 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v9, v24, v[32:33] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v0, v16, 0 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v24, v31, 0 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v9, v33 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v24, v8, v[9:10] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v24, v25 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v0, v17, v[24:25] +; GFX7-GISEL-NEXT: v_mov_b32_e32 v34, v33 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v24, v8, v[34:35] +; GFX7-GISEL-NEXT: ; implicit-def: $vgpr33_vgpr34 +; GFX7-GISEL-NEXT: v_mov_b32_e32 v33, v25 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v0, v17, v[33:34] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v1, v16, v[24:25] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v18, 0 +; GFX7-GISEL-NEXT: ; implicit-def: $vgpr16_vgpr17 +; GFX7-GISEL-NEXT: ; implicit-def: $vgpr24_vgpr25 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v31, v[8:9] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v26, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v10, v27, v[1:2] +; GFX7-GISEL-NEXT: v_mov_b32_e32 v16, v1 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v10, v27, v[16:17] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v18, 0 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v11, v26, v[9:10] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v0, 0 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v11 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v16, v9, v[1:2] +; GFX7-GISEL-NEXT: ; implicit-def: $vgpr26_vgpr27 +; GFX7-GISEL-NEXT: v_mov_b32_e32 v26, v17 +; GFX7-GISEL-NEXT: v_mov_b32_e32 v24, v11 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v16, v9, v[24:25] ; GFX7-GISEL-NEXT: buffer_load_dword v9, off, s[0:3], s32 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v17 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v2, v19, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v2, v19, v[26:27] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v18, v[1:2] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v28, 0 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v0, v[24:25] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v4, v20, 0 +; GFX7-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX7-GISEL-NEXT: ; implicit-def: $vgpr24_vgpr25 ; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v29, v[0:1] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v2, 0 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v3, v18 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v28, v[0:1] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v21, v[3:4] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v12 +; GFX7-GISEL-NEXT: v_mov_b32_e32 v24, v12 +; GFX7-GISEL-NEXT: ; implicit-def: $vgpr12_vgpr13 +; GFX7-GISEL-NEXT: v_mov_b32_e32 v12, v18 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v21, v[12:13] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v14, v30, 0 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v20, v[3:4] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v0, v[1:2] +; GFX7-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v0, v[24:25] ; GFX7-GISEL-NEXT: v_mov_b32_e32 v4, v13 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v6, v22, 0 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v2, v[0:1] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v3, v18 ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v9, v[4:5] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v12, 0 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v15, v30, v[4:5] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v23, v[3:4] +; GFX7-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX7-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX7-GISEL-NEXT: v_mov_b32_e32 v4, v18 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v23, v[4:5] ; GFX7-GISEL-NEXT: v_mov_b32_e32 v2, v14 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v17, v1, v[2:3] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v22, v[3:4] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v32, v11, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v22, v[4:5] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v13, 0 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v12, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v13, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v32, v0, v[5:6] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v10, v1, v[0:1] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v32, v11, 0 +; GFX7-GISEL-NEXT: ; implicit-def: $vgpr14_vgpr15 +; GFX7-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX7-GISEL-NEXT: v_mov_b32_e32 v14, v7 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v10, v1, v[14:15] +; GFX7-GISEL-NEXT: v_mov_b32_e32 v2, v5 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v32, v0, v[2:3] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v6, 0 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v13, v[9:10] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v11, v[5:6] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v9, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[3:4] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v11, v[2:3] +; GFX7-GISEL-NEXT: ; implicit-def: $vgpr7_vgpr8 +; GFX7-GISEL-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v9, v[7:8] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v2, v6, v[3:4] ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_vector_reduce_mul_v16i64: @@ -3687,63 +3782,82 @@ define i64 @test_vector_reduce_mul_v16i64(<16 x i64> %v) { ; GFX8-GISEL: ; %bb.0: ; %entry ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[31:32], s[4:5], v8, v24, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v8, v25, v[32:33] +; GFX8-GISEL-NEXT: ; implicit-def: $vgpr33_vgpr34 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v33, v32 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v8, v25, v[33:34] +; GFX8-GISEL-NEXT: ; implicit-def: $vgpr34_vgpr35 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v9, v24, v[32:33] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v0, v16, 0 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v24, v31, 0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v9, v33 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v24, v8, v[9:10] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v24, v25 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v0, v17, v[24:25] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v34, v33 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v24, v8, v[34:35] +; GFX8-GISEL-NEXT: ; implicit-def: $vgpr33_vgpr34 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v33, v25 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v0, v17, v[33:34] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v1, v16, v[24:25] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v18, 0 +; GFX8-GISEL-NEXT: ; implicit-def: $vgpr16_vgpr17 +; GFX8-GISEL-NEXT: ; implicit-def: $vgpr24_vgpr25 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v31, v[8:9] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v26, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v10, v27, v[1:2] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v16, v1 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v10, v27, v[16:17] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v18, 0 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v11, v26, v[9:10] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v0, 0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v11 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v16, v9, v[1:2] +; GFX8-GISEL-NEXT: ; implicit-def: $vgpr26_vgpr27 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v26, v17 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v24, v11 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v16, v9, v[24:25] ; GFX8-GISEL-NEXT: buffer_load_dword v9, off, s[0:3], s32 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v17 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v2, v19, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v2, v19, v[26:27] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v18, v[1:2] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v28, 0 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v0, v[24:25] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v4, v20, 0 +; GFX8-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX8-GISEL-NEXT: ; implicit-def: $vgpr24_vgpr25 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v29, v[0:1] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v2, 0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v3, v18 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v28, v[0:1] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v21, v[3:4] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v12 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v24, v12 +; GFX8-GISEL-NEXT: ; implicit-def: $vgpr12_vgpr13 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v12, v18 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v21, v[12:13] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v14, v30, 0 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v20, v[3:4] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v0, v[1:2] +; GFX8-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v0, v[24:25] ; GFX8-GISEL-NEXT: v_mov_b32_e32 v4, v13 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v6, v22, 0 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v2, v[0:1] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v3, v18 ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v9, v[4:5] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v12, 0 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v15, v30, v[4:5] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v23, v[3:4] +; GFX8-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX8-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v4, v18 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v23, v[4:5] ; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, v14 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v17, v1, v[2:3] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v22, v[3:4] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v32, v11, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v22, v[4:5] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v13, 0 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v12, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v13, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v32, v0, v[5:6] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v10, v1, v[0:1] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v32, v11, 0 +; GFX8-GISEL-NEXT: ; implicit-def: $vgpr14_vgpr15 +; GFX8-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v14, v7 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v10, v1, v[14:15] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, v5 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v32, v0, v[2:3] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v6, 0 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v13, v[9:10] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v11, v[5:6] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v9, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[3:4] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v11, v[2:3] +; GFX8-GISEL-NEXT: ; implicit-def: $vgpr7_vgpr8 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v9, v[7:8] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v2, v6, v[3:4] ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: test_vector_reduce_mul_v16i64: @@ -3951,65 +4065,82 @@ define i64 @test_vector_reduce_mul_v16i64(<16 x i64> %v) { ; GFX10-GISEL: ; %bb.0: ; %entry ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mad_u64_u32 v[31:32], s4, v0, v16, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[33:34], s4, v2, v18, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[38:39], s4, v6, v22, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[35:36], s4, v0, v17, v[32:33] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[36:37], s4, v4, v20, 0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, v34 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[16:17], s4, v1, v16, v[35:36] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v2, v19, v[0:1] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, v37 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, v39 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[34:35], s4, v8, v24, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[48:49], s4, v4, v21, v[1:2] +; GFX10-GISEL-NEXT: ; implicit-def: $vgpr33_vgpr34 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[35:36], s4, v2, v18, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[37:38], s4, v4, v20, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[48:49], s4, v6, v22, 0 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v33, v32 +; GFX10-GISEL-NEXT: ; implicit-def: $vgpr51_vgpr52 +; GFX10-GISEL-NEXT: ; implicit-def: $vgpr53_vgpr54 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[32:33], s4, v0, v17, v[33:34] +; GFX10-GISEL-NEXT: ; implicit-def: $vgpr33_vgpr34 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v33, v36 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[16:17], s4, v1, v16, v[32:33] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v2, v19, v[33:34] +; GFX10-GISEL-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX10-GISEL-NEXT: ; implicit-def: $vgpr32_vgpr33 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, v38 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v32, v49 ; GFX10-GISEL-NEXT: v_mad_u64_u32 v[49:50], s4, v10, v26, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v6, v23, v[2:3] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, v35 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], s4, v5, v20, v[48:49] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[51:52], s4, v8, v25, v[2:3] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, v50 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[52:53], s4, v10, v27, v[2:3] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[53:54], s4, v12, v28, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], s4, v9, v24, v[51:52] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, v54 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[10:11], s4, v11, v26, v[52:53] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[19:20], s4, v36, v53, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[54:55], s4, v12, v29, v[2:3] -; GFX10-GISEL-NEXT: buffer_load_dword v12, off, s[0:3], s32 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[38:39], s4, v8, v24, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v4, v21, v[1:2] +; GFX10-GISEL-NEXT: buffer_load_dword v21, off, s[0:3], s32 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[32:33], s4, v6, v23, v[32:33] +; GFX10-GISEL-NEXT: v_mov_b32_e32 v51, v50 +; GFX10-GISEL-NEXT: ; implicit-def: $vgpr33_vgpr34 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v33, v39 ; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v3, v18, v[0:1] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[50:51], s4, v10, v27, v[51:52] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[51:52], s4, v12, v28, 0 ; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v14, v30, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[17:18], s4, v33, v49, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[6:7], s4, v7, v22, v[1:2] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], s4, v31, v34, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v5, v20, v[1:2] +; GFX10-GISEL-NEXT: ; implicit-def: $vgpr5_vgpr6 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[33:34], s4, v8, v25, v[33:34] +; GFX10-GISEL-NEXT: v_mov_b32_e32 v53, v52 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[19:20], s4, v35, v49, 0 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v5, v4 +; GFX10-GISEL-NEXT: ; implicit-def: $vgpr17_vgpr18 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[52:53], s4, v12, v29, v[53:54] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], s4, v7, v22, v[32:33] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[8:9], s4, v9, v24, v[33:34] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[11:12], s4, v11, v26, v[50:51] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], s4, v31, v38, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[12:13], s4, v13, v28, v[52:53] +; GFX10-GISEL-NEXT: ; implicit-def: $vgpr24_vgpr25 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v17, v10 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[17:18], s4, v31, v8, v[17:18] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v14, v12, v[0:1] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[11:12], s4, v38, v3, 0 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, v8 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[13:14], s4, v13, v28, v[54:55] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[14:15], s4, v15, v30, v[0:1] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, v18 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[8:9], s4, v31, v9, v[1:2] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], s4, v33, v10, v[0:1] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, v20 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[14:15], s4, v38, v14, v[1:2] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[20:21], s4, v17, v11, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v36, v13, v[0:1] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[12:13], s4, v7, v19, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v6, v3, v[14:15] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], s4, v14, v21, v[5:6] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], s4, v48, v3, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[13:14], s4, v37, v51, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[21:22], s4, v15, v30, v[4:5] +; GFX10-GISEL-NEXT: ; implicit-def: $vgpr22_vgpr23 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v24, v6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v22, v20 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[10:11], s4, v35, v11, v[22:23] +; GFX10-GISEL-NEXT: ; implicit-def: $vgpr22_vgpr23 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v22, v14 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[14:15], s4, v48, v21, v[24:25] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[20:21], s4, v19, v5, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[11:12], s4, v37, v12, v[22:23] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[22:23], s4, v9, v13, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v7, v3, v[14:15] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[6:7], s4, v2, v49, v[10:11] +; GFX10-GISEL-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], s4, v0, v51, v[11:12] ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, v21 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[9:10], s4, v2, v49, v[9:10] -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, v13 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], s4, v5, v53, v[0:1] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[5:6], s4, v17, v3, v[1:2] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v12, v20, 0 -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v7, v4, v[2:3] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[13:14], s4, v16, v34, v[8:9] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v9, v11, v[5:6] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], s4, v13, v19, v[2:3] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v12, v3, v[1:2] -; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v4, v20, v[1:2] +; GFX10-GISEL-NEXT: ; implicit-def: $vgpr10_vgpr11 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v10, v23 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v19, v3, v[1:2] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[3:4], s4, v16, v38, v[17:18] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[0:1], s4, v22, v20, 0 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[7:8], s4, v9, v7, v[10:11] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[4:5], s4, v6, v5, v[2:3] +; GFX10-GISEL-NEXT: ; implicit-def: $vgpr5_vgpr6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v5, v1 +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v3, v13, v[7:8] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[2:3], s4, v22, v4, v[5:6] +; GFX10-GISEL-NEXT: v_mad_u64_u32 v[1:2], s4, v1, v20, v[2:3] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: test_vector_reduce_mul_v16i64: @@ -4090,66 +4221,90 @@ define i64 @test_vector_reduce_mul_v16i64(<16 x i64> %v) { ; GFX11-GISEL-LABEL: test_vector_reduce_mul_v16i64: ; GFX11-GISEL: ; %bb.0: ; %entry ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: scratch_load_b32 v71, off, s32 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[31:32], null, v0, v16, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[33:34], null, v2, v18, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[35:36], null, v4, v20, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[37:38], null, v6, v22, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[50:51], null, v10, v26, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[52:53], null, v12, v28, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[48:49], null, v8, v24, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[54:55], null, v14, v30, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[82:83], null, v0, v17, v[32:33] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[83:84], null, v2, v19, v[34:35] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[84:85], null, v4, v21, v[36:37] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[85:86], null, v6, v23, v[38:39] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[86:87], null, v10, v27, v[51:52] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[65:66], null, v31, v48, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[38:39], null, v8, v25, v[49:50] -; GFX11-GISEL-NEXT: v_mov_b32_e32 v64, v55 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[96:97], null, v12, v29, v[53:54] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[97:98], null, v1, v16, v[82:83] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v3, v18, v[83:84] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v5, v20, v[84:85] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v7, v22, v[85:86] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[67:68], null, v33, v50, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[80:81], null, v37, v54, 0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, v66 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[69:70], null, v35, v52, 0 +; GFX11-GISEL-NEXT: scratch_load_b32 v39, off, s32 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[68:69], null, v0, v16, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[70:71], null, v2, v18, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[80:81], null, v4, v20, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[82:83], null, v6, v22, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[84:85], null, v8, v24, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[86:87], null, v10, v26, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[98:99], null, v14, v30, 0 +; GFX11-GISEL-NEXT: ; implicit-def: $vgpr31_vgpr32 +; GFX11-GISEL-NEXT: ; implicit-def: $vgpr33_vgpr34 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[96:97], null, v12, v28, 0 +; GFX11-GISEL-NEXT: ; implicit-def: $vgpr35_vgpr36 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v31, v69 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v33, v71 +; GFX11-GISEL-NEXT: ; implicit-def: $vgpr37_vgpr38 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v35, v81 +; GFX11-GISEL-NEXT: ; implicit-def: $vgpr48_vgpr49 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v37, v83 :: v_dual_mov_b32 v48, v85 +; GFX11-GISEL-NEXT: ; implicit-def: $vgpr50_vgpr51 +; GFX11-GISEL-NEXT: ; implicit-def: $vgpr54_vgpr55 +; GFX11-GISEL-NEXT: ; implicit-def: $vgpr52_vgpr53 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[101:102], null, v70, v86, 0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v50, v87 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[116:117], null, v0, v17, v[31:32] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[31:32], null, v2, v19, v[33:34] +; GFX11-GISEL-NEXT: v_mov_b32_e32 v54, v99 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[32:33], null, v4, v21, v[35:36] +; GFX11-GISEL-NEXT: v_mov_b32_e32 v52, v97 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[33:34], null, v6, v23, v[37:38] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[34:35], null, v8, v25, v[48:49] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[35:36], null, v10, v27, v[50:51] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[36:37], null, v12, v29, v[52:53] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[99:100], null, v68, v84, 0 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[37:38], null, v1, v16, v[116:117] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v3, v18, v[31:32] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v5, v20, v[32:33] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v7, v22, v[33:34] +; GFX11-GISEL-NEXT: ; implicit-def: $vgpr64_vgpr65 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[114:115], null, v82, v98, 0 +; GFX11-GISEL-NEXT: ; implicit-def: $vgpr66_vgpr67 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[112:113], null, v80, v96, 0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v64, v100 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v66, v102 ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v14, v71, v[64:65] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v9, v24, v[38:39] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v11, v26, v[86:87] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v13, v28, v[96:97] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v15, v30, v[4:5] -; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, v68 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v31, v5, v[0:1] -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, v81 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v33, v6, v[4:5] -; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, v70 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v37, v8, v[0:1] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[11:12], null, v67, v80, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[13:14], null, v35, v7, v[4:5] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v65, v69, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[14:15], null, v3, v54, v[5:6] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v1, v50, v[10:11] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v2, v52, v[13:14] -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, v7 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v67, v14, v[0:1] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[12:13], null, v97, v48, v[9:10] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v6, v11, 0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v65, v4, v[2:3] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v3, v80, v[7:8] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v12, v69, v[8:9] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v6, v4, v[1:2] -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v9, v11, v[7:8] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v14, v39, v[54:55] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v9, v24, v[34:35] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v11, v26, v[35:36] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v13, v28, v[36:37] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v15, v30, v[3:4] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v68, v4, v[64:65] +; GFX11-GISEL-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v70, v5, v[66:67] +; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, v115 +; GFX11-GISEL-NEXT: ; implicit-def: $vgpr10_vgpr11 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v10, v113 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[12:13], null, v82, v7, v[3:4] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[13:14], null, v80, v6, v[10:11] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v101, v114, 0 +; GFX11-GISEL-NEXT: ; implicit-def: $vgpr14_vgpr15 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[5:6], null, v99, v112, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[10:11], null, v2, v98, v[12:13] +; GFX11-GISEL-NEXT: v_mov_b32_e32 v14, v4 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[16:17], null, v1, v96, v[13:14] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[11:12], null, v0, v86, v[9:10] +; GFX11-GISEL-NEXT: ; implicit-def: $vgpr12_vgpr13 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v5, v3, 0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v12, v6 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v101, v10, v[14:15] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[9:10], null, v37, v84, v[8:9] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[7:8], null, v99, v16, v[12:13] +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[12:13], null, v11, v114, v[6:7] +; GFX11-GISEL-NEXT: ; implicit-def: $vgpr10_vgpr11 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[13:14], null, v9, v112, v[7:8] +; GFX11-GISEL-NEXT: v_mov_b32_e32 v10, v1 +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v5, v12, v[10:11] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mad_u64_u32 v[1:2], null, v13, v3, v[6:7] ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-SDAG-LABEL: test_vector_reduce_mul_v16i64: diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll index b5d9d00c48045..53785250b0f98 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll @@ -64,9 +64,14 @@ define i8 @test_vector_reduce_smax_v2i8(<2 x i8> %v) { ; ; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_smax_v2i8: ; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX11-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr2 +; GFX11-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr3 ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 8 -; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v0, v3, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v1, v2, 0, 8 ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_max_i16 v0.l, v0.l, v1.l ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -91,13 +96,18 @@ define i8 @test_vector_reduce_smax_v2i8(<2 x i8> %v) { ; ; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_smax_v2i8: ; GFX12-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX12-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr2 +; GFX12-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr3 ; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 8 -; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.l +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v0, v3, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v1, v2, 0, 8 ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-TRUE16-NEXT: v_max_i16 v0.l, v0.l, v1.l ; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -207,14 +217,20 @@ define i8 @test_vector_reduce_smax_v3i8(<3 x i8> %v) { ; ; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_smax_v3i8: ; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX11-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr3 +; GFX11-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr4 ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v2, v2, 0, 8 -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l -; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l +; GFX11-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr2 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.l ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l -; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v2, v3, 0, 8 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v3, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v0, v4, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v2, v2, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-SDAG-TRUE16-NEXT: v_max_i16 v0.l, v0.l, v1.l ; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -245,18 +261,24 @@ define i8 @test_vector_reduce_smax_v3i8(<3 x i8> %v) { ; ; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_smax_v3i8: ; GFX12-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX12-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr3 +; GFX12-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr4 ; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v2, v2, 0, 8 -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l -; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l +; GFX12-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr2 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.l ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l -; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v2, v3, 0, 8 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v3, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v0, v4, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v2, v2, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.l +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-SDAG-TRUE16-NEXT: v_max_i16 v0.l, v0.l, v1.l ; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -446,19 +468,30 @@ define i8 @test_vector_reduce_smax_v4i8(<4 x i8> %v) { ; ; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_smax_v4i8: ; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX11-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr4 +; GFX11-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr5 ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v1, v1, 0, 8 -; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v3, 0, 8 -; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v2, v2, 0, 8 -; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 8 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v3.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v1, v5, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v4, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr4 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-SDAG-TRUE16-NEXT: v_max_i16 v1.l, v1.l, v3.l +; GFX11-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr3 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l ; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1 -; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v1, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v2, v3, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 8, v1 +; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v1, v4, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v0, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l ; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.l ; GFX11-SDAG-TRUE16-NEXT: v_max3_i16 v0.l, v0.l, v1.l, v2.l ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -516,23 +549,34 @@ define i8 @test_vector_reduce_smax_v4i8(<4 x i8> %v) { ; ; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_smax_v4i8: ; GFX12-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX12-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr4 +; GFX12-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr5 ; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v1, v1, 0, 8 -; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v3, 0, 8 -; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v2, v2, 0, 8 -; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 8 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v3.l +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v1, v5, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v4, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr4 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-SDAG-TRUE16-NEXT: v_max_i16 v1.l, v1.l, v3.l +; GFX12-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr3 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l ; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1 -; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v1, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v2, v3, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 8, v1 +; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v1, v4, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v0, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l ; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.l ; GFX12-SDAG-TRUE16-NEXT: v_max3_i16 v0.l, v0.l, v1.l, v2.l ; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -795,36 +839,52 @@ define i8 @test_vector_reduce_smax_v8i8(<8 x i8> %v) { ; ; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_smax_v8i8: ; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX11-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr8 +; GFX11-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr9 +; GFX11-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr10 ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v5, v5, 0, 8 -; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v1, v1, 0, 8 -; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v8, v3, 0, 8 -; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v7, v7, 0, 8 -; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v2, v2, 0, 8 -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v8.l +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v9.l, v1.l +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l +; GFX11-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr5 +; GFX11-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr1 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v10.l, v0.l +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.l +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v7.l +; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v9, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v8, v8, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr9 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v7, v1, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v8.l +; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v8, v5, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v7.l +; GFX11-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr4 +; GFX11-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr7 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.l ; GFX11-SDAG-TRUE16-NEXT: v_max_i16 v1.l, v1.l, v3.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v7.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_max3_i16 v0.l, v1.l, v5.l, v3.l -; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v1, v7, 0, 8 -; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v4, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v8.l ; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v6.l +; GFX11-SDAG-TRUE16-NEXT: v_max3_i16 v0.l, v1.l, v5.l, v3.l +; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v1, v10, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v9, 0, 8 ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v0.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v4, v4, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v0.l ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l ; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.l -; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v4, 0, 8 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 8, v5 +; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v7, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v2 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.l ; GFX11-SDAG-TRUE16-NEXT: v_max_i16 v0.l, v0.l, v1.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.l -; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v4, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.l +; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v5, 0, 8 ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-SDAG-TRUE16-NEXT: v_max3_i16 v0.l, v0.l, v1.l, v2.l ; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.l @@ -901,40 +961,56 @@ define i8 @test_vector_reduce_smax_v8i8(<8 x i8> %v) { ; ; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_smax_v8i8: ; GFX12-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX12-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr8 +; GFX12-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr9 +; GFX12-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr10 ; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v5, v5, 0, 8 -; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v1, v1, 0, 8 -; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v8, v3, 0, 8 -; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v7, v7, 0, 8 -; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v2, v2, 0, 8 -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.l -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v8.l +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v9.l, v1.l +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l +; GFX12-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr5 +; GFX12-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr1 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v10.l, v0.l +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.l +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v7.l +; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v9, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v8, v8, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr9 +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v7, v1, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.l +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v8.l +; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v8, v5, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v7.l +; GFX12-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr4 +; GFX12-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr7 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.l ; GFX12-SDAG-TRUE16-NEXT: v_max_i16 v1.l, v1.l, v3.l -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v7.l -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.l -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-TRUE16-NEXT: v_max3_i16 v0.l, v1.l, v5.l, v3.l -; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v1, v7, 0, 8 -; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v4, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v8.l ; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v6.l +; GFX12-SDAG-TRUE16-NEXT: v_max3_i16 v0.l, v1.l, v5.l, v3.l +; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v1, v10, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v9, 0, 8 ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v0.l -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v4, v4, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v0.l ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l ; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.l -; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v4, 0, 8 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 8, v5 +; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v7, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX12-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v2 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.l ; GFX12-SDAG-TRUE16-NEXT: v_max_i16 v0.l, v0.l, v1.l -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.l -; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v4, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.l +; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v5, 0, 8 ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-SDAG-TRUE16-NEXT: v_max3_i16 v0.l, v0.l, v1.l, v2.l ; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.l @@ -1306,55 +1382,86 @@ define i8 @test_vector_reduce_smax_v16i8(<16 x i8> %v) { ; ; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_smax_v16i8: ; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX11-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr16 +; GFX11-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr18 +; GFX11-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr20 +; GFX11-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr19 ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v16.l, v12.l +; GFX11-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr12 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v18.l, v10.l +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v12.l, v4.l +; GFX11-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr4 +; GFX11-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr10 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v20.l, v6.l +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v13.l +; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v13, v16, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v10.l, v2.l +; GFX11-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr2 +; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v12, v12, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.l ; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v16, v4, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr4 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v19.l, v14.l ; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v5.l +; GFX11-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr5 ; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v9.l -; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v1, v1, 0, 8 -; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v17, v0, 0, 8 -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v11.l -; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v9, v4, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr9 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v9.l, v1.l +; GFX11-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr1 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v17, v4, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v8.l +; GFX11-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr8 ; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v4, v5, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v8.l, v0.l +; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v0, v9, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr5 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v9, v1, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr1 ; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v15.l -; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v13, v13, 0, 8 -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v18.l, v2.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l -; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v6, v0, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v11.l +; GFX11-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr11 +; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v8, v8, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v11.l, v7.l +; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v7, v2, 0, 8 ; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v5, v5, 0, 8 -; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v11, v3, 0, 8 -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v6, v1, 0, 8 ; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v4.l -; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v2, v2, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v11, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v7.l +; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v7, v20, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-SDAG-TRUE16-NEXT: v_max_i16 v0.l, v0.l, v1.l +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.l ; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v11.l ; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v6.l -; GFX11-SDAG-TRUE16-NEXT: v_max_i16 v0.l, v0.l, v1.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v9.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v6.l, v13.l +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v17.l +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v6.l, v16.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-SDAG-TRUE16-NEXT: v_max_i16 v0.h, v2.l, v3.l ; GFX11-SDAG-TRUE16-NEXT: v_max_i16 v1.h, v4.l, v5.l -; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v8, v8, 0, 8 -; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v7, v7, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v8.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-SDAG-TRUE16-NEXT: v_max3_i16 v0.l, v0.l, v1.l, v6.l -; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v12, v12, 0, 8 -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v9.l, v14.l -; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v6, v10, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v6, v18, 0, 8 ; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v7.l +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.l +; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v5, v10, 0, 8 ; GFX11-SDAG-TRUE16-NEXT: v_max3_i16 v0.l, v0.l, v1.h, v0.h -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v17.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v8.l -; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v5, v18, 0, 8 -; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v4, v9, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v4, v19, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-SDAG-TRUE16-NEXT: v_max_i16 v0.h, v2.l, v3.l +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v12.l ; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v0.l ; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v6.l -; GFX11-SDAG-TRUE16-NEXT: v_max_i16 v0.h, v2.l, v3.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v16.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v12.l -; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 8, v7 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v13.l ; GFX11-SDAG-TRUE16-NEXT: v_max_i16 v1.l, v1.l, v4.l -; GFX11-SDAG-TRUE16-NEXT: v_max_i16 v0.l, v5.l, v0.l ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 8, v7 +; GFX11-SDAG-TRUE16-NEXT: v_max_i16 v0.l, v5.l, v0.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-SDAG-TRUE16-NEXT: v_max3_i16 v0.h, v0.h, v2.l, v3.l ; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v2, v6, 0, 8 ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -1462,59 +1569,90 @@ define i8 @test_vector_reduce_smax_v16i8(<16 x i8> %v) { ; ; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_smax_v16i8: ; GFX12-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX12-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr16 +; GFX12-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr18 +; GFX12-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr20 +; GFX12-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr19 ; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v16.l, v12.l +; GFX12-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr12 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v18.l, v10.l +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v12.l, v4.l +; GFX12-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr4 +; GFX12-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr10 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v20.l, v6.l +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v13.l +; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v13, v16, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v10.l, v2.l +; GFX12-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr2 +; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v12, v12, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.l ; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v16, v4, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr4 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v19.l, v14.l ; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v5.l +; GFX12-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr5 ; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v9.l -; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v1, v1, 0, 8 -; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v17, v0, 0, 8 -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v11.l -; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v9, v4, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr9 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v9.l, v1.l +; GFX12-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr1 +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v17, v4, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v8.l +; GFX12-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr8 ; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v4, v5, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v8.l, v0.l +; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v0, v9, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr5 +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v9, v1, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr1 ; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v15.l -; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v13, v13, 0, 8 -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v18.l, v2.l -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.l -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l -; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v6, v0, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v11.l +; GFX12-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr11 +; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v8, v8, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v11.l, v7.l +; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v7, v2, 0, 8 ; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v5, v5, 0, 8 -; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v11, v3, 0, 8 -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v6, v1, 0, 8 ; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v4.l -; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v2, v2, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v11, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v7.l +; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v7, v20, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-SDAG-TRUE16-NEXT: v_max_i16 v0.l, v0.l, v1.l +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.l ; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.l -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v11.l ; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v6.l -; GFX12-SDAG-TRUE16-NEXT: v_max_i16 v0.l, v0.l, v1.l -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v9.l -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v6.l, v13.l +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v17.l +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v6.l, v16.l +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX12-SDAG-TRUE16-NEXT: v_max_i16 v0.h, v2.l, v3.l ; GFX12-SDAG-TRUE16-NEXT: v_max_i16 v1.h, v4.l, v5.l -; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v8, v8, 0, 8 -; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v7, v7, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v8.l +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX12-SDAG-TRUE16-NEXT: v_max3_i16 v0.l, v0.l, v1.l, v6.l -; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v12, v12, 0, 8 -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v9.l, v14.l -; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v6, v10, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v6, v18, 0, 8 ; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v7.l +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.l +; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v5, v10, 0, 8 ; GFX12-SDAG-TRUE16-NEXT: v_max3_i16 v0.l, v0.l, v1.h, v0.h -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v17.l -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v8.l -; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v5, v18, 0, 8 -; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v4, v9, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v4, v19, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX12-SDAG-TRUE16-NEXT: v_max_i16 v0.h, v2.l, v3.l +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v12.l ; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v0.l ; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v6.l -; GFX12-SDAG-TRUE16-NEXT: v_max_i16 v0.h, v2.l, v3.l -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v16.l -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v12.l -; GFX12-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 8, v7 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v13.l ; GFX12-SDAG-TRUE16-NEXT: v_max_i16 v1.l, v1.l, v4.l -; GFX12-SDAG-TRUE16-NEXT: v_max_i16 v0.l, v5.l, v0.l ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 8, v7 +; GFX12-SDAG-TRUE16-NEXT: v_max_i16 v0.l, v5.l, v0.l +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-SDAG-TRUE16-NEXT: v_max3_i16 v0.h, v0.h, v2.l, v3.l ; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v2, v6, 0, 8 ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll index 2a989ecd2ebad..4eccee8e816e6 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll @@ -64,9 +64,14 @@ define i8 @test_vector_reduce_smin_v2i8(<2 x i8> %v) { ; ; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_smin_v2i8: ; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX11-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr2 +; GFX11-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr3 ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 8 -; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v0, v3, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v1, v2, 0, 8 ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_min_i16 v0.l, v0.l, v1.l ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -91,13 +96,18 @@ define i8 @test_vector_reduce_smin_v2i8(<2 x i8> %v) { ; ; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_smin_v2i8: ; GFX12-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX12-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr2 +; GFX12-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr3 ; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 8 -; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.l +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v0, v3, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v1, v2, 0, 8 ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-TRUE16-NEXT: v_min_i16 v0.l, v0.l, v1.l ; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -207,14 +217,20 @@ define i8 @test_vector_reduce_smin_v3i8(<3 x i8> %v) { ; ; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_smin_v3i8: ; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX11-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr3 +; GFX11-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr4 ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v2, v2, 0, 8 -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l -; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l +; GFX11-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr2 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.l ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l -; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v2, v3, 0, 8 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v3, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v0, v4, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v2, v2, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-SDAG-TRUE16-NEXT: v_min_i16 v0.l, v0.l, v1.l ; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -245,18 +261,24 @@ define i8 @test_vector_reduce_smin_v3i8(<3 x i8> %v) { ; ; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_smin_v3i8: ; GFX12-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX12-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr3 +; GFX12-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr4 ; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v2, v2, 0, 8 -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l -; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l +; GFX12-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr2 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.l ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l -; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v2, v3, 0, 8 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v3, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v0, v4, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v2, v2, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.l +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-SDAG-TRUE16-NEXT: v_min_i16 v0.l, v0.l, v1.l ; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -446,19 +468,30 @@ define i8 @test_vector_reduce_smin_v4i8(<4 x i8> %v) { ; ; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_smin_v4i8: ; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX11-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr4 +; GFX11-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr5 ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v1, v1, 0, 8 -; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v3, 0, 8 -; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v2, v2, 0, 8 -; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 8 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v3.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v1, v5, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v4, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr4 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-SDAG-TRUE16-NEXT: v_min_i16 v1.l, v1.l, v3.l +; GFX11-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr3 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l ; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1 -; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v1, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v2, v3, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 8, v1 +; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v1, v4, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v0, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l ; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.l ; GFX11-SDAG-TRUE16-NEXT: v_min3_i16 v0.l, v0.l, v1.l, v2.l ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -516,23 +549,34 @@ define i8 @test_vector_reduce_smin_v4i8(<4 x i8> %v) { ; ; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_smin_v4i8: ; GFX12-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX12-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr4 +; GFX12-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr5 ; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v1, v1, 0, 8 -; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v3, 0, 8 -; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v2, v2, 0, 8 -; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 8 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v3.l +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v1, v5, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v4, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr4 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-SDAG-TRUE16-NEXT: v_min_i16 v1.l, v1.l, v3.l +; GFX12-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr3 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l ; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1 -; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v1, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v2, v3, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 8, v1 +; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v1, v4, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v0, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l ; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.l ; GFX12-SDAG-TRUE16-NEXT: v_min3_i16 v0.l, v0.l, v1.l, v2.l ; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -795,36 +839,52 @@ define i8 @test_vector_reduce_smin_v8i8(<8 x i8> %v) { ; ; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_smin_v8i8: ; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX11-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr8 +; GFX11-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr9 +; GFX11-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr10 ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v5, v5, 0, 8 -; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v1, v1, 0, 8 -; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v8, v3, 0, 8 -; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v7, v7, 0, 8 -; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v2, v2, 0, 8 -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v8.l +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v9.l, v1.l +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l +; GFX11-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr5 +; GFX11-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr1 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v10.l, v0.l +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.l +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v7.l +; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v9, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v8, v8, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr9 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v7, v1, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v8.l +; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v8, v5, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v7.l +; GFX11-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr4 +; GFX11-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr7 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.l ; GFX11-SDAG-TRUE16-NEXT: v_min_i16 v1.l, v1.l, v3.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v7.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_min3_i16 v0.l, v1.l, v5.l, v3.l -; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v1, v7, 0, 8 -; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v4, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v8.l ; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v6.l +; GFX11-SDAG-TRUE16-NEXT: v_min3_i16 v0.l, v1.l, v5.l, v3.l +; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v1, v10, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v9, 0, 8 ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v0.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v4, v4, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v0.l ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l ; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.l -; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v4, 0, 8 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 8, v5 +; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v7, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v2 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.l ; GFX11-SDAG-TRUE16-NEXT: v_min_i16 v0.l, v0.l, v1.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.l -; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v4, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.l +; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v5, 0, 8 ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-SDAG-TRUE16-NEXT: v_min3_i16 v0.l, v0.l, v1.l, v2.l ; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.l @@ -901,40 +961,56 @@ define i8 @test_vector_reduce_smin_v8i8(<8 x i8> %v) { ; ; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_smin_v8i8: ; GFX12-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX12-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr8 +; GFX12-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr9 +; GFX12-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr10 ; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v5, v5, 0, 8 -; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v1, v1, 0, 8 -; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v8, v3, 0, 8 -; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v7, v7, 0, 8 -; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v2, v2, 0, 8 -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.l -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v8.l +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v9.l, v1.l +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l +; GFX12-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr5 +; GFX12-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr1 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v10.l, v0.l +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.l +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v7.l +; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v9, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v8, v8, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr9 +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v7, v1, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.l +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v8.l +; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v8, v5, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v7.l +; GFX12-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr4 +; GFX12-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr7 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.l ; GFX12-SDAG-TRUE16-NEXT: v_min_i16 v1.l, v1.l, v3.l -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v7.l -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.l -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-TRUE16-NEXT: v_min3_i16 v0.l, v1.l, v5.l, v3.l -; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v1, v7, 0, 8 -; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v4, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v8.l ; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v6.l +; GFX12-SDAG-TRUE16-NEXT: v_min3_i16 v0.l, v1.l, v5.l, v3.l +; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v1, v10, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v9, 0, 8 ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v0.l -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v4, v4, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v0.l ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l ; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.l -; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v4, 0, 8 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 8, v5 +; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v7, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX12-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v2 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.l ; GFX12-SDAG-TRUE16-NEXT: v_min_i16 v0.l, v0.l, v1.l -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.l -; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v4, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.l +; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v5, 0, 8 ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-SDAG-TRUE16-NEXT: v_min3_i16 v0.l, v0.l, v1.l, v2.l ; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.l @@ -1306,55 +1382,86 @@ define i8 @test_vector_reduce_smin_v16i8(<16 x i8> %v) { ; ; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_smin_v16i8: ; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX11-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr16 +; GFX11-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr18 +; GFX11-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr20 +; GFX11-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr19 ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v16.l, v12.l +; GFX11-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr12 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v18.l, v10.l +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v12.l, v4.l +; GFX11-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr4 +; GFX11-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr10 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v20.l, v6.l +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v13.l +; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v13, v16, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v10.l, v2.l +; GFX11-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr2 +; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v12, v12, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.l ; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v16, v4, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr4 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v19.l, v14.l ; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v5.l +; GFX11-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr5 ; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v9.l -; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v1, v1, 0, 8 -; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v17, v0, 0, 8 -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v11.l -; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v9, v4, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr9 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v9.l, v1.l +; GFX11-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr1 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v17, v4, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v8.l +; GFX11-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr8 ; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v4, v5, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v8.l, v0.l +; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v0, v9, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr5 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v9, v1, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr1 ; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v15.l -; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v13, v13, 0, 8 -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v18.l, v2.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l -; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v6, v0, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v11.l +; GFX11-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr11 +; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v8, v8, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v11.l, v7.l +; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v7, v2, 0, 8 ; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v5, v5, 0, 8 -; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v11, v3, 0, 8 -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v6, v1, 0, 8 ; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v4.l -; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v2, v2, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v11, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v7.l +; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v7, v20, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-SDAG-TRUE16-NEXT: v_min_i16 v0.l, v0.l, v1.l +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.l ; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v11.l ; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v6.l -; GFX11-SDAG-TRUE16-NEXT: v_min_i16 v0.l, v0.l, v1.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v9.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v6.l, v13.l +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v17.l +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v6.l, v16.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-SDAG-TRUE16-NEXT: v_min_i16 v0.h, v2.l, v3.l ; GFX11-SDAG-TRUE16-NEXT: v_min_i16 v1.h, v4.l, v5.l -; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v8, v8, 0, 8 -; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v7, v7, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v8.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-SDAG-TRUE16-NEXT: v_min3_i16 v0.l, v0.l, v1.l, v6.l -; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v12, v12, 0, 8 -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v9.l, v14.l -; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v6, v10, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v6, v18, 0, 8 ; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v7.l +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.l +; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v5, v10, 0, 8 ; GFX11-SDAG-TRUE16-NEXT: v_min3_i16 v0.l, v0.l, v1.h, v0.h -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v17.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v8.l -; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v5, v18, 0, 8 -; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v4, v9, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v4, v19, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-SDAG-TRUE16-NEXT: v_min_i16 v0.h, v2.l, v3.l +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v12.l ; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v0.l ; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v6.l -; GFX11-SDAG-TRUE16-NEXT: v_min_i16 v0.h, v2.l, v3.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v16.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v12.l -; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 8, v7 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v13.l ; GFX11-SDAG-TRUE16-NEXT: v_min_i16 v1.l, v1.l, v4.l -; GFX11-SDAG-TRUE16-NEXT: v_min_i16 v0.l, v5.l, v0.l ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 8, v7 +; GFX11-SDAG-TRUE16-NEXT: v_min_i16 v0.l, v5.l, v0.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-SDAG-TRUE16-NEXT: v_min3_i16 v0.h, v0.h, v2.l, v3.l ; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v2, v6, 0, 8 ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -1462,59 +1569,90 @@ define i8 @test_vector_reduce_smin_v16i8(<16 x i8> %v) { ; ; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_smin_v16i8: ; GFX12-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX12-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr16 +; GFX12-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr18 +; GFX12-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr20 +; GFX12-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr19 ; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v16.l, v12.l +; GFX12-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr12 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v18.l, v10.l +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v12.l, v4.l +; GFX12-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr4 +; GFX12-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr10 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v20.l, v6.l +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v13.l +; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v13, v16, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v10.l, v2.l +; GFX12-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr2 +; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v12, v12, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.l ; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v16, v4, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr4 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v19.l, v14.l ; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v5.l +; GFX12-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr5 ; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v9.l -; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v1, v1, 0, 8 -; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v17, v0, 0, 8 -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v11.l -; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v9, v4, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr9 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v9.l, v1.l +; GFX12-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr1 +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v17, v4, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v8.l +; GFX12-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr8 ; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v4, v5, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v8.l, v0.l +; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v0, v9, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr5 +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v9, v1, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr1 ; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v15.l -; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v13, v13, 0, 8 -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v18.l, v2.l -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.l -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l -; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v6, v0, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v11.l +; GFX12-SDAG-TRUE16-NEXT: ; implicit-def: $vgpr11 +; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v8, v8, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v11.l, v7.l +; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v7, v2, 0, 8 ; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v5, v5, 0, 8 -; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v11, v3, 0, 8 -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v6, v1, 0, 8 ; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v4.l -; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v2, v2, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v3, v11, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v7.l +; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v7, v20, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-SDAG-TRUE16-NEXT: v_min_i16 v0.l, v0.l, v1.l +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.l ; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.l -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v11.l ; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v6.l -; GFX12-SDAG-TRUE16-NEXT: v_min_i16 v0.l, v0.l, v1.l -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v9.l -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v6.l, v13.l +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v17.l +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v6.l, v16.l +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX12-SDAG-TRUE16-NEXT: v_min_i16 v0.h, v2.l, v3.l ; GFX12-SDAG-TRUE16-NEXT: v_min_i16 v1.h, v4.l, v5.l -; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v8, v8, 0, 8 -; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v7, v7, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v8.l +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX12-SDAG-TRUE16-NEXT: v_min3_i16 v0.l, v0.l, v1.l, v6.l -; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v12, v12, 0, 8 -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v9.l, v14.l -; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v6, v10, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v6, v18, 0, 8 ; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v7.l +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.l +; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v5, v10, 0, 8 ; GFX12-SDAG-TRUE16-NEXT: v_min3_i16 v0.l, v0.l, v1.h, v0.h -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v17.l -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v8.l -; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v5, v18, 0, 8 -; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v4, v9, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v4, v19, 0, 8 +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX12-SDAG-TRUE16-NEXT: v_min_i16 v0.h, v2.l, v3.l +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v12.l ; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v0.l ; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v6.l -; GFX12-SDAG-TRUE16-NEXT: v_min_i16 v0.h, v2.l, v3.l -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v16.l -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v12.l -; GFX12-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 8, v7 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v13.l ; GFX12-SDAG-TRUE16-NEXT: v_min_i16 v1.l, v1.l, v4.l -; GFX12-SDAG-TRUE16-NEXT: v_min_i16 v0.l, v5.l, v0.l ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 8, v7 +; GFX12-SDAG-TRUE16-NEXT: v_min_i16 v0.l, v5.l, v0.l +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-SDAG-TRUE16-NEXT: v_min3_i16 v0.h, v0.h, v2.l, v3.l ; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v2, v6, 0, 8 ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) diff --git a/llvm/test/CodeGen/AMDGPU/vector_rebroadcast.ll b/llvm/test/CodeGen/AMDGPU/vector_rebroadcast.ll index 07e9325095017..3018cd3d20d7f 100644 --- a/llvm/test/CodeGen/AMDGPU/vector_rebroadcast.ll +++ b/llvm/test/CodeGen/AMDGPU/vector_rebroadcast.ll @@ -454,6 +454,7 @@ define <2 x i16> @shuffle_v2i16_rebroadcast(ptr addrspace(1) %arg0) { ; GFX11-TRUE16: ; %bb.0: ; %entry ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -498,6 +499,7 @@ define <4 x i16> @shuffle_v4i16_rebroadcast(ptr addrspace(1) %arg0) { ; GFX11-TRUE16: ; %bb.0: ; %entry ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -549,6 +551,7 @@ define <8 x i16> @shuffle_v8i16_rebroadcast(ptr addrspace(1) %arg0) { ; GFX11-TRUE16: ; %bb.0: ; %entry ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -612,6 +615,7 @@ define <16 x i16> @shuffle_v16i16_rebroadcast(ptr addrspace(1) %arg0) { ; GFX11-TRUE16: ; %bb.0: ; %entry ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -699,6 +703,7 @@ define <32 x i16> @shuffle_v32i16_rebroadcast(ptr addrspace(1) %arg0) { ; GFX11-TRUE16: ; %bb.0: ; %entry ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -1119,6 +1124,7 @@ define <3 x bfloat> @shuffle_v3bf16_rebroadcast(ptr addrspace(1) %arg0) { ; GFX11-TRUE16: ; %bb.0: ; %entry ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h @@ -1518,6 +1524,7 @@ define <3 x half> @shuffle_v3f16_rebroadcast(ptr addrspace(1) %arg0) { ; GFX11-TRUE16: ; %bb.0: ; %entry ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll index b01e92d6979a3..5874b667ce5e2 100644 --- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll +++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll @@ -209,10 +209,11 @@ define <4 x half> @shuffle_v4f16_3u6u(ptr addrspace(1) %arg0, ptr addrspace(1) % ; GFX11-TRUE16-LABEL: shuffle_v4f16_3u6u: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off offset:4 +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:4 ; GFX11-TRUE16-NEXT: global_load_b32 v1, v[2:3], off offset:4 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v4.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -268,10 +269,11 @@ define <4 x half> @shuffle_v4f16_3uu7(ptr addrspace(1) %arg0, ptr addrspace(1) % ; GFX11-TRUE16-LABEL: shuffle_v4f16_3uu7: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off offset:4 +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:4 ; GFX11-TRUE16-NEXT: global_load_b32 v1, v[2:3], off offset:4 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v4.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -389,9 +391,12 @@ define <4 x half> @shuffle_v4f16_357u(ptr addrspace(1) %arg0, ptr addrspace(1) % ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:4 ; GFX11-TRUE16-NEXT: global_load_b64 v[0:1], v[2:3], off +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: shuffle_v4f16_357u: @@ -2628,6 +2633,7 @@ define <2 x i16> @i16_hi16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) { ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_b32 v2, v[2:3], off ; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -3505,10 +3511,11 @@ define <4 x bfloat> @shuffle_v4bf16_3u6u(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-TRUE16-LABEL: shuffle_v4bf16_3u6u: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off offset:4 +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:4 ; GFX11-TRUE16-NEXT: global_load_b32 v1, v[2:3], off offset:4 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v4.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -3564,10 +3571,11 @@ define <4 x bfloat> @shuffle_v4bf16_3uu7(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-TRUE16-LABEL: shuffle_v4bf16_3uu7: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_b32 v0, v[0:1], off offset:4 +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:4 ; GFX11-TRUE16-NEXT: global_load_b32 v1, v[2:3], off offset:4 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v4.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -3685,9 +3693,12 @@ define <4 x bfloat> @shuffle_v4bf16_357u(ptr addrspace(1) %arg0, ptr addrspace(1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:4 ; GFX11-TRUE16-NEXT: global_load_b64 v[0:1], v[2:3], off +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: shuffle_v4bf16_357u: diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll index 53bede84513c9..c9830010b8056 100644 --- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll +++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll @@ -237,11 +237,20 @@ bb: } define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negA: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] neg_hi:[1,0,0] -; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off -; GFX12-NEXT: s_endpgm +; GFX12-TRUE16-LABEL: test_swmmac_f32_16x16x32_f16_negA: +; GFX12-TRUE16: ; %bb.0: ; %bb +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr13 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v13.l, v10.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v13 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-TRUE16-NEXT: global_store_b128 v[11:12], v[6:9], off +; GFX12-TRUE16-NEXT: s_endpgm +; +; GFX12-FAKE16-LABEL: test_swmmac_f32_16x16x32_f16_negA: +; GFX12-FAKE16: ; %bb.0: ; %bb +; GFX12-FAKE16-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-FAKE16-NEXT: global_store_b128 v[11:12], v[6:9], off +; GFX12-FAKE16-NEXT: s_endpgm bb: %fneg.A = fneg <4 x half> %A %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.i16(<4 x half> %fneg.A, <8 x half> %B, <4 x float> %C, i16 %Index) @@ -250,11 +259,20 @@ bb: } define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negB: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] neg_hi:[0,1,0] -; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off -; GFX12-NEXT: s_endpgm +; GFX12-TRUE16-LABEL: test_swmmac_f32_16x16x32_f16_negB: +; GFX12-TRUE16: ; %bb.0: ; %bb +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr13 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v13.l, v10.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v13 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-TRUE16-NEXT: global_store_b128 v[11:12], v[6:9], off +; GFX12-TRUE16-NEXT: s_endpgm +; +; GFX12-FAKE16-LABEL: test_swmmac_f32_16x16x32_f16_negB: +; GFX12-FAKE16: ; %bb.0: ; %bb +; GFX12-FAKE16-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-FAKE16-NEXT: global_store_b128 v[11:12], v[6:9], off +; GFX12-FAKE16-NEXT: s_endpgm bb: %fneg.B = fneg <8 x half> %B %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.i16(<4 x half> %A, <8 x half> %fneg.B, <4 x float> %C, i16 %Index) @@ -263,11 +281,20 @@ bb: } define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negA: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] neg_hi:[1,0,0] -; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off -; GFX12-NEXT: s_endpgm +; GFX12-TRUE16-LABEL: test_swmmac_f16_16x16x32_f16_negA: +; GFX12-TRUE16: ; %bb.0: ; %bb +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr11 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v11 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-TRUE16-NEXT: global_store_b64 v[9:10], v[6:7], off +; GFX12-TRUE16-NEXT: s_endpgm +; +; GFX12-FAKE16-LABEL: test_swmmac_f16_16x16x32_f16_negA: +; GFX12-FAKE16: ; %bb.0: ; %bb +; GFX12-FAKE16-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-FAKE16-NEXT: global_store_b64 v[9:10], v[6:7], off +; GFX12-FAKE16-NEXT: s_endpgm bb: %fneg.A = fneg <4 x half> %A %res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.i16(<4 x half> %fneg.A, <8 x half> %B, <4 x half> %C, i16 %Index) @@ -276,11 +303,20 @@ bb: } define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, ptr addrspace(1) %out) { -; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negB: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] neg_hi:[0,1,0] -; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off -; GFX12-NEXT: s_endpgm +; GFX12-TRUE16-LABEL: test_swmmac_f16_16x16x32_f16_negB: +; GFX12-TRUE16: ; %bb.0: ; %bb +; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr11 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v11 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-TRUE16-NEXT: global_store_b64 v[9:10], v[6:7], off +; GFX12-TRUE16-NEXT: s_endpgm +; +; GFX12-FAKE16-LABEL: test_swmmac_f16_16x16x32_f16_negB: +; GFX12-FAKE16: ; %bb.0: ; %bb +; GFX12-FAKE16-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-FAKE16-NEXT: global_store_b64 v[9:10], v[6:7], off +; GFX12-FAKE16-NEXT: s_endpgm bb: %fneg.B = fneg <8 x half> %B %res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.i16(<4 x half> %A, <8 x half> %fneg.B, <4 x half> %C, i16 %Index)