diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index eb9aabf8b6317..3a3f303293461 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -1510,76 +1510,128 @@ bool SIFoldOperandsImpl::foldCopyToAGPRRegSequence(MachineInstr *CopyMI) const { // only accept VGPR or inline immediate. Recreate a reg_sequence with its // initializers right here, so we will rematerialize immediates and avoid // copies via different reg classes. - if (!TRI->isAGPR(*MRI, CopyMI->getOperand(0).getReg())) + const TargetRegisterClass *DefRC = + MRI->getRegClass(CopyMI->getOperand(0).getReg()); + if (!TRI->isAGPRClass(DefRC)) return false; + Register UseReg = CopyMI->getOperand(1).getReg(); - SmallVector, 32> Defs; - if (!getRegSeqInit(Defs, UseReg, AMDGPU::OPERAND_REG_INLINE_C_INT32)) + MachineInstr *RegSeq = MRI->getVRegDef(UseReg); + if (!RegSeq || !RegSeq->isRegSequence()) return false; const DebugLoc &DL = CopyMI->getDebugLoc(); MachineBasicBlock &MBB = *CopyMI->getParent(); + MachineInstrBuilder B(*MBB.getParent(), CopyMI); + DenseMap VGPRCopies; + SmallSetVector SeenInputs; + + const TargetRegisterClass *UseRC = + MRI->getRegClass(CopyMI->getOperand(1).getReg()); + + // Value, subregindex for new REG_SEQUENCE + SmallVector, 32> NewDefs; + + unsigned NumRegSeqOperands = RegSeq->getNumOperands(); + unsigned NumFoldable = 0; + + for (unsigned I = 1; I != NumRegSeqOperands; I += 2) { + MachineOperand &RegOp = RegSeq->getOperand(I); + unsigned SubRegIdx = RegSeq->getOperand(I + 1).getImm(); + + if (RegOp.getSubReg()) { + // TODO: Handle subregister compose + NewDefs.emplace_back(&RegOp, SubRegIdx); + continue; + } + + MachineOperand *Lookup = lookUpCopyChain(*TII, *MRI, RegOp.getReg()); + if (!Lookup) + Lookup = &RegOp; + + if (Lookup->isImm()) { + // Check if this is an agpr_32 subregister. + const TargetRegisterClass *DestSuperRC = TRI->getMatchingSuperRegClass( + DefRC, &AMDGPU::AGPR_32RegClass, SubRegIdx); + if (DestSuperRC && + TII->isInlineConstant(*Lookup, AMDGPU::OPERAND_REG_INLINE_C_INT32)) { + ++NumFoldable; + NewDefs.emplace_back(Lookup, SubRegIdx); + continue; + } + } + + const TargetRegisterClass *InputRC = + Lookup->isReg() ? MRI->getRegClass(Lookup->getReg()) + : MRI->getRegClass(RegOp.getReg()); + + // TODO: Account for Lookup->getSubReg() + + // If we can't find a matching super class, this is an SGPR->AGPR or + // VGPR->AGPR subreg copy (or something constant-like we have to materialize + // in the AGPR). We can't directly copy from SGPR to AGPR on gfx908, so we + // want to rewrite to copy to an intermediate VGPR class. + const TargetRegisterClass *MatchRC = + TRI->getMatchingSuperRegClass(DefRC, InputRC, SubRegIdx); + if (!MatchRC) { + ++NumFoldable; + NewDefs.emplace_back(&RegOp, SubRegIdx); + continue; + } + + NewDefs.emplace_back(&RegOp, SubRegIdx); + } + + // Do not clone a reg_sequence and merely change the result register class. + if (NumFoldable == 0) + return false; + CopyMI->setDesc(TII->get(AMDGPU::REG_SEQUENCE)); for (unsigned I = CopyMI->getNumOperands() - 1; I > 0; --I) CopyMI->removeOperand(I); - MachineInstrBuilder B(*MBB.getParent(), CopyMI); - DenseMap VGPRCopies; - SmallSetVector SeenAGPRs; - for (unsigned I = 0, NumElts = Defs.size(); I != NumElts; ++I) { - MachineOperand *Def = Defs[I].first; - TargetInstrInfo::RegSubRegPair CopyToVGPR; - if (Def->isImm() && - TII->isInlineConstant(*Def, AMDGPU::OPERAND_REG_INLINE_C_INT32)) { - int64_t Imm = Def->getImm(); - - auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass); + for (auto [Def, DestSubIdx] : NewDefs) { + if (!Def->isReg()) { + // TODO: Should we use single write for each repeated value like in + // register case? + Register Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass); BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp) - .addImm(Imm); + .add(*Def); B.addReg(Tmp); - } else if (Def->isReg() && TRI->isAGPR(*MRI, Def->getReg())) { - auto Src = getRegSubRegPair(*Def); + } else { + TargetInstrInfo::RegSubRegPair Src = getRegSubRegPair(*Def); Def->setIsKill(false); - if (!SeenAGPRs.insert(Src)) { + + Register &VGPRCopy = VGPRCopies[Src]; + if (!VGPRCopy) { + const TargetRegisterClass *VGPRUseSubRC = + TRI->getSubRegisterClass(UseRC, DestSubIdx); + // We cannot build a reg_sequence out of the same registers, they // must be copied. Better do it here before copyPhysReg() created // several reads to do the AGPR->VGPR->AGPR copy. - CopyToVGPR = Src; - } else { - B.addReg(Src.Reg, Def->isUndef() ? RegState::Undef : 0, Src.SubReg); - } - } else { - assert(Def->isReg()); - Def->setIsKill(false); - auto Src = getRegSubRegPair(*Def); - // Direct copy from SGPR to AGPR is not possible. To avoid creation - // of exploded copies SGPR->VGPR->AGPR in the copyPhysReg() later, - // create a copy here and track if we already have such a copy. - if (TRI->isSGPRReg(*MRI, Src.Reg)) { - CopyToVGPR = Src; + // Direct copy from SGPR to AGPR is not possible on gfx908. To avoid + // creation of exploded copies SGPR->VGPR->AGPR in the copyPhysReg() + // later, create a copy here and track if we already have such a copy. + if (TRI->getSubRegisterClass(MRI->getRegClass(Src.Reg), Src.SubReg) != + VGPRUseSubRC) { + VGPRCopy = MRI->createVirtualRegister(VGPRUseSubRC); + BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::COPY), VGPRCopy).add(*Def); + B.addReg(VGPRCopy); + } else { + // If it is already a VGPR, do not copy the register. + B.add(*Def); + } } else { - auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass); - BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::COPY), Tmp).add(*Def); - B.addReg(Tmp); + B.addReg(VGPRCopy); } } - if (CopyToVGPR.Reg) { - auto [It, Inserted] = VGPRCopies.try_emplace(CopyToVGPR); - Register &Vgpr = It->second; - if (Inserted) { - Vgpr = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); - BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::COPY), Vgpr).add(*Def); - } - Register Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass); - BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::COPY), Tmp).addReg(Vgpr); - B.addReg(Tmp); - } - - B.addImm(Defs[I].second); + B.addImm(DestSubIdx); } + LLVM_DEBUG(dbgs() << "Folded " << *CopyMI); return true; } @@ -1634,6 +1686,13 @@ bool SIFoldOperandsImpl::tryFoldFoldableCopy( foldCopyToVGPROfScalarAddOfFrameIndex(DstReg, OpToFold.getReg(), MI)) return true; + // Fold copy to AGPR through reg_sequence + // TODO: Handle with subregister extract + if (OpToFold.isReg() && MI.isCopy() && !MI.getOperand(1).getSubReg()) { + if (foldCopyToAGPRRegSequence(&MI)) + return true; + } + bool Changed = foldInstOperand(MI, OpToFold); // If we managed to fold all uses of this copy then we might as well diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll index a31064e293622..3f5a99cad9543 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll @@ -16,13 +16,15 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) # ; GCN-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 ; GCN-NEXT: s_mov_b64 s[36:37], 1 ; GCN-NEXT: v_pk_mov_b32 v[0:1], s[36:37], s[36:37] op_sel:[0,1] -; GCN-NEXT: s_mov_b32 s36, 2 -; GCN-NEXT: v_pk_mov_b32 v[2:3], s[36:37], s[36:37] op_sel:[0,1] +; GCN-NEXT: s_mov_b32 s38, 2 +; GCN-NEXT: s_mov_b32 s39, s37 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x0 ; GCN-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x40 +; GCN-NEXT: v_pk_mov_b32 v[2:3], s[38:39], s[38:39] op_sel:[0,1] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_accvgpr_write_b32 a0, s0 +; GCN-NEXT: v_accvgpr_write_b32 a16, s16 ; GCN-NEXT: v_accvgpr_write_b32 a1, s1 ; GCN-NEXT: v_accvgpr_write_b32 a2, s2 ; GCN-NEXT: v_accvgpr_write_b32 a3, s3 @@ -38,7 +40,6 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) # ; GCN-NEXT: v_accvgpr_write_b32 a13, s13 ; GCN-NEXT: v_accvgpr_write_b32 a14, s14 ; GCN-NEXT: v_accvgpr_write_b32 a15, s15 -; GCN-NEXT: v_accvgpr_write_b32 a16, s16 ; GCN-NEXT: v_accvgpr_write_b32 a17, s17 ; GCN-NEXT: v_accvgpr_write_b32 a18, s18 ; GCN-NEXT: v_accvgpr_write_b32 a19, s19 @@ -317,31 +318,29 @@ bb: define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, double %a, double %b) #0 { ; GCN-LABEL: test_mfma_f64_16x16x4f64_imm: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 -; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34 -; GCN-NEXT: s_mov_b64 s[0:1], 0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x34 ; GCN-NEXT: s_mov_b64 s[6:7], 1.0 -; GCN-NEXT: s_mov_b64 s[2:3], s[0:1] +; GCN-NEXT: s_mov_b64 s[8:9], 0 +; GCN-NEXT: v_accvgpr_write_b32 a0, s8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_pk_mov_b32 v[0:1], s[10:11], s[10:11] op_sel:[0,1] -; GCN-NEXT: s_mov_b64 s[4:5], s[0:1] -; GCN-NEXT: v_accvgpr_write_b32 a0, s0 -; GCN-NEXT: v_accvgpr_write_b32 a1, s1 -; GCN-NEXT: v_accvgpr_write_b32 a2, s2 -; GCN-NEXT: v_accvgpr_write_b32 a3, s3 -; GCN-NEXT: v_accvgpr_write_b32 a4, s4 -; GCN-NEXT: v_accvgpr_write_b32 a5, s5 +; GCN-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GCN-NEXT: v_accvgpr_write_b32 a2, s8 +; GCN-NEXT: v_accvgpr_write_b32 a4, s8 ; GCN-NEXT: v_accvgpr_write_b32 a6, s6 +; GCN-NEXT: v_accvgpr_write_b32 a1, s9 +; GCN-NEXT: v_accvgpr_write_b32 a3, s9 +; GCN-NEXT: v_accvgpr_write_b32 a5, s9 ; GCN-NEXT: v_accvgpr_write_b32 a7, s7 -; GCN-NEXT: v_pk_mov_b32 v[2:3], s[12:13], s[12:13] op_sel:[0,1] +; GCN-NEXT: v_pk_mov_b32 v[2:3], s[10:11], s[10:11] op_sel:[0,1] ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 0 -; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[8:9] -; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[8:9] offset:16 +; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GCN-NEXT: s_endpgm bb: %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> , i32 0, i32 0, i32 0) @@ -352,32 +351,29 @@ bb: define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %arg, double %a, double %b) #0 { ; GCN-LABEL: test_mfma_f64_16x16x4f64_splat_lit: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 -; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34 -; GCN-NEXT: s_mov_b32 s0, 0 -; GCN-NEXT: s_mov_b32 s1, 0x405ec000 -; GCN-NEXT: s_mov_b64 s[2:3], s[0:1] +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_mov_b32 s7, 0x405ec000 +; GCN-NEXT: v_accvgpr_write_b32 a0, s6 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_pk_mov_b32 v[0:1], s[10:11], s[10:11] op_sel:[0,1] -; GCN-NEXT: s_mov_b64 s[4:5], s[0:1] -; GCN-NEXT: s_mov_b64 s[6:7], s[0:1] -; GCN-NEXT: v_accvgpr_write_b32 a0, s0 -; GCN-NEXT: v_accvgpr_write_b32 a1, s1 -; GCN-NEXT: v_accvgpr_write_b32 a2, s2 -; GCN-NEXT: v_accvgpr_write_b32 a3, s3 -; GCN-NEXT: v_accvgpr_write_b32 a4, s4 -; GCN-NEXT: v_accvgpr_write_b32 a5, s5 +; GCN-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GCN-NEXT: v_accvgpr_write_b32 a2, s6 +; GCN-NEXT: v_accvgpr_write_b32 a4, s6 ; GCN-NEXT: v_accvgpr_write_b32 a6, s6 +; GCN-NEXT: v_accvgpr_write_b32 a1, s7 +; GCN-NEXT: v_accvgpr_write_b32 a3, s7 +; GCN-NEXT: v_accvgpr_write_b32 a5, s7 ; GCN-NEXT: v_accvgpr_write_b32 a7, s7 -; GCN-NEXT: v_pk_mov_b32 v[2:3], s[12:13], s[12:13] op_sel:[0,1] +; GCN-NEXT: v_pk_mov_b32 v[2:3], s[8:9], s[8:9] op_sel:[0,1] ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 0 -; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[8:9] -; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[8:9] offset:16 +; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GCN-NEXT: s_endpgm bb: %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> , i32 0, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/acc-ldst.ll b/llvm/test/CodeGen/AMDGPU/acc-ldst.ll index 5c484e1e52da8..726bfbab7ad48 100644 --- a/llvm/test/CodeGen/AMDGPU/acc-ldst.ll +++ b/llvm/test/CodeGen/AMDGPU/acc-ldst.ll @@ -190,11 +190,11 @@ bb: ; NB: for atomics both vdata and vdst shall be either VGPR or AGPR ; GCN-LABEL: {{^}}test_atomic_mfma_4xi32_atomic_store: +; GCN: v_accvgpr_write_b32 [[A_ZERO:a[0-9]+]], 0 ; GCN: global_atomic_sub [[IN:v[0-9]+]], v{{[0-9:]+}}, v{{[0-9]+}}, s[{{[0-9:]+}}] glc +; GCN-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, [[A_ZERO]] +; GCN-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, [[A_ZERO]] ; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, [[IN]] -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} ; GCN: v_mfma_i32_4x4x4i8 a[[[N:[0-9]+]]: ; GCN: v_accvgpr_read_b32 [[V:v[0-9]+]], a[[N]]{{$}} ; GCN: global_atomic_add v{{[0-9]+}}, v{{[0-9:]+}}, [[V]], s[{{[0-9:]+}}] glc @@ -217,7 +217,10 @@ bb: ; GCN-LABEL: {{^}}test_atomic_mfma_4xi32_atomic64_store: ; GCN: global_atomic_sub_x2 v[{{[0-9:]+}}], v{{[0-9:]+}}, v[{{[0-9:]+}}], s[{{[0-9:]+}}] glc -; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 [[A_ZERO:a[0-9]+]], 0 +; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, [[A_ZERO]] +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} ; GCN: v_mfma_i32_4x4x4i8 a[[[N:[0-9]+]]: ; GCN: v_accvgpr_read_b32 v{{[0-9]+}}, a{{[0-9]+}} ; GCN: v_accvgpr_read_b32 v{{[0-9]+}}, a{{[0-9]+}} diff --git a/llvm/test/CodeGen/AMDGPU/coalesces-better.mir b/llvm/test/CodeGen/AMDGPU/coalesces-better.mir new file mode 100644 index 0000000000000..593220d879c2a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/coalesces-better.mir @@ -0,0 +1,74 @@ +# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx950 -start-after=si-fold-operands -o - %s | FileCheck %s + +--- | + target triple = "amdgcn-mesa-mesa3d" + + define <4 x float> @test_smfmac_f32_16x16x64_f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3) #0 { + ; CHECK-LABEL: test_smfmac_f32_16x16x64_f16: + ; CHECK: ; %bb.0: + ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) + ; CHECK-NEXT: v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16 + ; CHECK-NEXT: s_nop 7 + ; CHECK-NEXT: v_mov_b32_e32 v0, v12 + ; CHECK-NEXT: v_mov_b32_e32 v1, v13 + ; CHECK-NEXT: v_mov_b32_e32 v2, v14 + ; CHECK-NEXT: v_mov_b32_e32 v3, v15 + ; CHECK-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) + ret <4 x float> %result + } + + ; Function Attrs: convergent nocallback nofree nosync nounwind willreturn memory(none) + declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half>, <16 x half>, <4 x float>, i32, i32 immarg, i32 immarg) #1 + + attributes #0 = { "target-cpu"="gfx950" } + attributes #1 = { convergent nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx950" } + +... +--- +name: test_smfmac_f32_16x16x64_f16 +tracksRegLiveness: true +isSSA: true +machineFunctionInfo: + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + sgprForEXECCopy: '$sgpr100_sgpr101' +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16 + + + %0:vgpr_32 = COPY $vgpr16 + %1:vgpr_32 = COPY $vgpr15 + %2:vgpr_32 = COPY $vgpr14 + %3:vgpr_32 = COPY $vgpr13 + %4:vgpr_32 = COPY $vgpr12 + %5:vgpr_32 = COPY $vgpr11 + %6:vgpr_32 = COPY $vgpr10 + %7:vgpr_32 = COPY $vgpr9 + %8:vgpr_32 = COPY $vgpr8 + %9:vgpr_32 = COPY $vgpr7 + %10:vgpr_32 = COPY $vgpr6 + %11:vgpr_32 = COPY $vgpr5 + %12:vgpr_32 = COPY $vgpr4 + %13:vgpr_32 = COPY $vgpr3 + %14:vgpr_32 = COPY $vgpr2 + %15:vgpr_32 = COPY $vgpr1 + %16:vgpr_32 = COPY $vgpr0 + %17:vreg_256_align2 = REG_SEQUENCE %12, %subreg.sub0, %11, %subreg.sub1, %10, %subreg.sub2, %9, %subreg.sub3, %8, %subreg.sub4, %7, %subreg.sub5, %6, %subreg.sub6, %5, %subreg.sub7 + %18:vreg_128_align2 = REG_SEQUENCE %4, %subreg.sub0, %3, %subreg.sub1, %2, %subreg.sub2, %1, %subreg.sub3 + %19:vreg_128_align2 = REG_SEQUENCE %16, %subreg.sub0, %15, %subreg.sub1, %14, %subreg.sub2, %13, %subreg.sub3 + %24:areg_128_align2 = COPY %18 + %25:areg_128_align2 = V_SMFMAC_F32_16X16X64_F16_e64 %19, %17, %0, 0, 0, %24, implicit $mode, implicit $exec + %26:vgpr_32 = COPY %25.sub0 + %27:vgpr_32 = COPY %25.sub1 + %28:vgpr_32 = COPY %25.sub2 + %29:vgpr_32 = COPY %25.sub3 + $vgpr0 = COPY %26 + $vgpr1 = COPY %27 + $vgpr2 = COPY %28 + $vgpr3 = COPY %29 + SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + +... diff --git a/llvm/test/CodeGen/AMDGPU/coalesces-worse.mir b/llvm/test/CodeGen/AMDGPU/coalesces-worse.mir new file mode 100644 index 0000000000000..0718f825fbacb --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/coalesces-worse.mir @@ -0,0 +1,71 @@ +# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx950 -start-after=si-fold-operands -o - %s | FileCheck %s + +--- | + target triple = "amdgcn-mesa-mesa3d" + + define <4 x float> @test_smfmac_f32_16x16x64_f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3) #0 { + ; CHECK-LABEL: test_smfmac_f32_16x16x64_f16: + ; CHECK: ; %bb.0: + ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) + ; CHECK-NEXT: v_accvgpr_write_b32 a0, v12 + ; CHECK-NEXT: v_accvgpr_write_b32 a1, v13 + ; CHECK-NEXT: v_accvgpr_write_b32 a2, v14 + ; CHECK-NEXT: v_accvgpr_write_b32 a3, v15 + ; CHECK-NEXT: s_nop 1 + ; CHECK-NEXT: v_smfmac_f32_16x16x64_f16 a[0:3], v[0:3], v[4:11], v16 + ; CHECK-NEXT: s_nop 7 + ; CHECK-NEXT: v_accvgpr_read_b32 v0, a0 + ; CHECK-NEXT: v_accvgpr_read_b32 v1, a1 + ; CHECK-NEXT: v_accvgpr_read_b32 v2, a2 + ; CHECK-NEXT: v_accvgpr_read_b32 v3, a3 + ; CHECK-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) + ret <4 x float> %result + } + + declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half>, <16 x half>, <4 x float>, i32, i32 immarg, i32 immarg) #1 + + attributes #0 = { "target-cpu"="gfx950" } + attributes #1 = { convergent nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx950" } + +... +--- +name: test_smfmac_f32_16x16x64_f16 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16 + + %0:vgpr_32 = COPY $vgpr16 + %1:vgpr_32 = COPY $vgpr15 + %2:vgpr_32 = COPY $vgpr14 + %3:vgpr_32 = COPY $vgpr13 + %4:vgpr_32 = COPY $vgpr12 + %5:vgpr_32 = COPY $vgpr11 + %6:vgpr_32 = COPY $vgpr10 + %7:vgpr_32 = COPY $vgpr9 + %8:vgpr_32 = COPY $vgpr8 + %9:vgpr_32 = COPY $vgpr7 + %10:vgpr_32 = COPY $vgpr6 + %11:vgpr_32 = COPY $vgpr5 + %12:vgpr_32 = COPY $vgpr4 + %13:vgpr_32 = COPY $vgpr3 + %14:vgpr_32 = COPY $vgpr2 + %15:vgpr_32 = COPY $vgpr1 + %16:vgpr_32 = COPY $vgpr0 + %17:vreg_256_align2 = REG_SEQUENCE %12, %subreg.sub0, %11, %subreg.sub1, %10, %subreg.sub2, %9, %subreg.sub3, %8, %subreg.sub4, %7, %subreg.sub5, %6, %subreg.sub6, %5, %subreg.sub7 + %18:vreg_128_align2 = REG_SEQUENCE %4, %subreg.sub0, %3, %subreg.sub1, %2, %subreg.sub2, %1, %subreg.sub3 + %19:vreg_128_align2 = REG_SEQUENCE %16, %subreg.sub0, %15, %subreg.sub1, %14, %subreg.sub2, %13, %subreg.sub3 + %20:areg_128_align2 = V_SMFMAC_F32_16X16X64_F16_e64 %19, %17, %0, 0, 0, %18, implicit $mode, implicit $exec + %21:vgpr_32 = COPY %20.sub0 + %22:vgpr_32 = COPY %20.sub1 + %23:vgpr_32 = COPY %20.sub2 + %24:vgpr_32 = COPY %20.sub3 + $vgpr0 = COPY %21 + $vgpr1 = COPY %22 + $vgpr2 = COPY %23 + $vgpr3 = COPY %24 + SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + +... diff --git a/llvm/test/CodeGen/AMDGPU/fold-agpr-phis.mir b/llvm/test/CodeGen/AMDGPU/fold-agpr-phis.mir index a9b3eaf4c33a3..7e81b78ac378c 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-agpr-phis.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-agpr-phis.mir @@ -796,3 +796,39 @@ body: | S_ENDPGM 0 ... + + + + +--- +name: phi_output_reg_type_is_vgpr +tracksRegLiveness: true +body: | + bb.0: + liveins: $scc + successors: %bb.1 + + %0:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1:sgpr_32 = S_MOV_B32 0 + %2:sgpr_128 = REG_SEQUENCE %1, %subreg.sub0, %1, %subreg.sub1, %1, %subreg.sub2, %1, %subreg.sub3 + %3:vreg_128 = COPY %2 + %4:sreg_64 = S_MOV_B64 0 + %5:areg_128_align2 = COPY %3, implicit $exec + + bb.1: + liveins: $scc + successors: %bb.1, %bb.2 + + %9:areg_128_align2 = PHI %5, %bb.0, %10, %bb.1 + %11:areg_128_align2 = V_MFMA_F32_16X16X4F32_e64 %0:vgpr_32, %0:vgpr_32, %9:areg_128_align2, 0, 0, 0, implicit $mode, implicit $exec + %12:vgpr_32 = COPY %11.sub3 + %13:vgpr_32 = COPY %11.sub2 + %14:vgpr_32 = COPY %11.sub1 + %15:vgpr_32 = COPY %11.sub0 + %10:areg_128_align2 = COPY %11, implicit $exec + S_CBRANCH_SCC1 %bb.1, implicit $scc + + bb.2: + S_ENDPGM 0 + +... diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll index 25b857f8f47dd..7cc726a3bd79c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll @@ -1494,10 +1494,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inr ; SDAG-NEXT: v_mov_b32_e32 v18, s18 ; SDAG-NEXT: v_mov_b32_e32 v19, s19 ; SDAG-NEXT: v_mov_b32_e32 v20, s28 -; SDAG-NEXT: v_mov_b32_e32 v23, v1 -; SDAG-NEXT: v_mov_b32_e32 v22, v0 ; SDAG-NEXT: v_mov_b32_e32 v21, s29 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v20 ; SDAG-NEXT: v_mov_b32_e32 v4, s20 ; SDAG-NEXT: v_mov_b32_e32 v5, s21 ; SDAG-NEXT: v_mov_b32_e32 v6, s22 @@ -1506,9 +1503,10 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inr ; SDAG-NEXT: v_mov_b32_e32 v9, s25 ; SDAG-NEXT: v_mov_b32_e32 v10, s26 ; SDAG-NEXT: v_mov_b32_e32 v11, s27 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v20 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v0 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v1 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[12:19], v[4:11], a[0:3], v2, v3 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 7 @@ -1531,17 +1529,15 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inr ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] ; GISEL-NEXT: v_mov_b32_e32 v20, s28 -; GISEL-NEXT: v_mov_b32_e32 v22, v0 -; GISEL-NEXT: v_mov_b32_e32 v23, v1 ; GISEL-NEXT: v_mov_b32_e32 v21, s29 ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] -; GISEL-NEXT: v_accvgpr_write_b32 a0, v20 ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] ; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27] +; GISEL-NEXT: v_accvgpr_write_b32 a0, v20 ; GISEL-NEXT: v_accvgpr_write_b32 a1, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v23 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v0 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v1 ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[4:11], v[12:19], a[0:3], v2, v3 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 7 @@ -1667,7 +1663,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp ; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgpr_sgpr: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 ; SDAG-NEXT: v_mov_b32_e32 v14, s0 ; SDAG-NEXT: v_mov_b32_e32 v15, s1 ; SDAG-NEXT: v_mov_b32_e32 v16, s2 @@ -1676,6 +1671,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp ; SDAG-NEXT: v_mov_b32_e32 v19, s17 ; SDAG-NEXT: v_mov_b32_e32 v20, s18 ; SDAG-NEXT: v_mov_b32_e32 v21, s19 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 @@ -1697,10 +1693,10 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp ; GISEL-NEXT: s_mov_b32 s14, s2 ; GISEL-NEXT: s_mov_b32 s15, s3 ; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[18:19] -; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[16:17] ; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13] +; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 ; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 ; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 @@ -1817,8 +1813,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale ; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scaleB_inlineimm: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 ; SDAG-NEXT: s_movk_i32 s0, 0x41 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 @@ -1857,8 +1853,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale ; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scaleB_kimm: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 ; SDAG-NEXT: s_movk_i32 s0, 0x41 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 @@ -2365,4 +2361,4 @@ declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32>, <6 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1 attributes #0 = { "amdgpu-flat-work-group-size"="512,512" } -attributes #1 = { convergent nocallback nofree nosync nounwind willreturn memory(none) } \ No newline at end of file +attributes #1 = { convergent nocallback nofree nosync nounwind willreturn memory(none) } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll index 3d959393a8fa7..dac54c9f85e96 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll @@ -3515,56 +3515,42 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inr ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v32, s0 -; SDAG-NEXT: v_mov_b32_e32 v33, s1 -; SDAG-NEXT: v_mov_b32_e32 v34, s2 -; SDAG-NEXT: v_mov_b32_e32 v35, s3 -; SDAG-NEXT: v_mov_b32_e32 v36, s16 -; SDAG-NEXT: v_mov_b32_e32 v37, s17 -; SDAG-NEXT: v_mov_b32_e32 v38, s18 -; SDAG-NEXT: v_mov_b32_e32 v39, s19 -; SDAG-NEXT: v_mov_b32_e32 v16, s28 -; SDAG-NEXT: v_mov_b32_e32 v31, v13 -; SDAG-NEXT: v_mov_b32_e32 v30, v12 -; SDAG-NEXT: v_mov_b32_e32 v29, v11 -; SDAG-NEXT: v_mov_b32_e32 v28, v10 -; SDAG-NEXT: v_mov_b32_e32 v27, v9 -; SDAG-NEXT: v_mov_b32_e32 v26, v8 -; SDAG-NEXT: v_mov_b32_e32 v25, v7 -; SDAG-NEXT: v_mov_b32_e32 v24, v6 -; SDAG-NEXT: v_mov_b32_e32 v23, v5 -; SDAG-NEXT: v_mov_b32_e32 v22, v4 -; SDAG-NEXT: v_mov_b32_e32 v21, v3 -; SDAG-NEXT: v_mov_b32_e32 v20, v2 -; SDAG-NEXT: v_mov_b32_e32 v19, v1 -; SDAG-NEXT: v_mov_b32_e32 v18, v0 -; SDAG-NEXT: v_mov_b32_e32 v17, s29 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_mov_b32_e32 v0, s20 -; SDAG-NEXT: v_mov_b32_e32 v1, s21 -; SDAG-NEXT: v_mov_b32_e32 v2, s22 -; SDAG-NEXT: v_mov_b32_e32 v3, s23 -; SDAG-NEXT: v_mov_b32_e32 v4, s24 -; SDAG-NEXT: v_mov_b32_e32 v5, s25 -; SDAG-NEXT: v_mov_b32_e32 v6, s26 -; SDAG-NEXT: v_mov_b32_e32 v7, s27 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v31 +; SDAG-NEXT: v_mov_b32_e32 v24, s0 +; SDAG-NEXT: v_mov_b32_e32 v25, s1 +; SDAG-NEXT: v_mov_b32_e32 v26, s2 +; SDAG-NEXT: v_mov_b32_e32 v27, s3 +; SDAG-NEXT: v_mov_b32_e32 v28, s16 +; SDAG-NEXT: v_mov_b32_e32 v29, s17 +; SDAG-NEXT: v_mov_b32_e32 v30, s18 +; SDAG-NEXT: v_mov_b32_e32 v31, s19 +; SDAG-NEXT: v_mov_b32_e32 v32, s28 +; SDAG-NEXT: v_mov_b32_e32 v33, s29 +; SDAG-NEXT: v_mov_b32_e32 v16, s20 +; SDAG-NEXT: v_mov_b32_e32 v17, s21 +; SDAG-NEXT: v_mov_b32_e32 v18, s22 +; SDAG-NEXT: v_mov_b32_e32 v19, s23 +; SDAG-NEXT: v_mov_b32_e32 v20, s24 +; SDAG-NEXT: v_mov_b32_e32 v21, s25 +; SDAG-NEXT: v_mov_b32_e32 v22, s26 +; SDAG-NEXT: v_mov_b32_e32 v23, s27 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v32 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v33 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v0 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v1 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v2 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v3 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v4 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v5 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v6 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v7 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v8 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v9 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v10 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v13 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[32:39], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[24:31], v[16:23], a[0:15], v14, v15 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 @@ -3593,48 +3579,34 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inr ; GISEL-NEXT: s_mov_b32 s13, s1 ; GISEL-NEXT: s_mov_b32 s14, s2 ; GISEL-NEXT: s_mov_b32 s15, s3 -; GISEL-NEXT: v_mov_b32_e32 v18, v0 -; GISEL-NEXT: v_mov_b32_e32 v19, v1 -; GISEL-NEXT: v_mov_b32_e32 v20, v2 -; GISEL-NEXT: v_mov_b32_e32 v21, v3 -; GISEL-NEXT: v_mov_b32_e32 v22, v4 -; GISEL-NEXT: v_mov_b32_e32 v23, v5 -; GISEL-NEXT: v_mov_b32_e32 v24, v6 -; GISEL-NEXT: v_mov_b32_e32 v25, v7 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19] -; GISEL-NEXT: v_mov_b32_e32 v16, s28 -; GISEL-NEXT: v_mov_b32_e32 v26, v8 -; GISEL-NEXT: v_mov_b32_e32 v27, v9 -; GISEL-NEXT: v_mov_b32_e32 v28, v10 -; GISEL-NEXT: v_mov_b32_e32 v29, v11 -; GISEL-NEXT: v_mov_b32_e32 v30, v12 -; GISEL-NEXT: v_mov_b32_e32 v31, v13 -; GISEL-NEXT: v_mov_b32_e32 v17, s29 -; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[26:27] -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[20:21] -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 -; GISEL-NEXT: v_accvgpr_write_b32 a15, v31 +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[12:13] +; GISEL-NEXT: v_mov_b32_e32 v32, s28 +; GISEL-NEXT: v_mov_b32_e32 v33, s29 +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[20:21] +; GISEL-NEXT: v_accvgpr_write_b32 a0, v32 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v33 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v0 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v1 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v2 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v3 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v4 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v5 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v6 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v7 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v8 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v9 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v10 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v11 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v13 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[32:39], a[0:15], v14, v15 op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[16:23], v[24:31], a[0:15], v14, v15 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 3 @@ -3871,7 +3843,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgpr_sgpr: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 ; SDAG-NEXT: v_mov_b32_e32 v26, s0 ; SDAG-NEXT: v_mov_b32_e32 v27, s1 ; SDAG-NEXT: v_mov_b32_e32 v28, s2 @@ -3880,6 +3851,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp ; SDAG-NEXT: v_mov_b32_e32 v31, s17 ; SDAG-NEXT: v_mov_b32_e32 v32, s18 ; SDAG-NEXT: v_mov_b32_e32 v33, s19 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 ; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 @@ -3926,10 +3898,10 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp ; GISEL-NEXT: s_mov_b32 s14, s2 ; GISEL-NEXT: s_mov_b32 s15, s3 ; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[18:19] -; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 ; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[16:17] ; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[12:13] +; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 ; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 ; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 ; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 @@ -3972,93 +3944,47 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp } define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_vgpr_sgpr__vgpr_sgpr(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> inreg %arg2, i32 %scale0, i32 inreg %scale1) { -; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_vgpr_sgpr__vgpr_sgpr: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 -; SDAG-NEXT: v_accvgpr_write_b32 a4, s16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, s17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, s18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, s19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, s20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, s21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, s22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, s23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, s24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, s25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, s26 -; SDAG-NEXT: v_accvgpr_write_b32 a15, s27 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, s28 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_vgpr_sgpr__vgpr_sgpr: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s12, s0 -; GISEL-NEXT: s_mov_b32 s13, s1 -; GISEL-NEXT: s_mov_b32 s14, s2 -; GISEL-NEXT: s_mov_b32 s15, s3 -; GISEL-NEXT: v_accvgpr_write_b32 a0, s12 -; GISEL-NEXT: v_accvgpr_write_b32 a1, s13 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s14 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s15 -; GISEL-NEXT: v_accvgpr_write_b32 a4, s16 -; GISEL-NEXT: v_accvgpr_write_b32 a5, s17 -; GISEL-NEXT: v_accvgpr_write_b32 a6, s18 -; GISEL-NEXT: v_accvgpr_write_b32 a7, s19 -; GISEL-NEXT: v_accvgpr_write_b32 a8, s20 -; GISEL-NEXT: v_accvgpr_write_b32 a9, s21 -; GISEL-NEXT: v_accvgpr_write_b32 a10, s22 -; GISEL-NEXT: v_accvgpr_write_b32 a11, s23 -; GISEL-NEXT: v_accvgpr_write_b32 a12, s24 -; GISEL-NEXT: v_accvgpr_write_b32 a13, s25 -; GISEL-NEXT: v_accvgpr_write_b32 a14, s26 -; GISEL-NEXT: v_accvgpr_write_b32 a15, s27 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, s28 op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: v_accvgpr_read_b32 v4, a4 -; GISEL-NEXT: v_accvgpr_read_b32 v5, a5 -; GISEL-NEXT: v_accvgpr_read_b32 v6, a6 -; GISEL-NEXT: v_accvgpr_read_b32 v7, a7 -; GISEL-NEXT: v_accvgpr_read_b32 v8, a8 -; GISEL-NEXT: v_accvgpr_read_b32 v9, a9 -; GISEL-NEXT: v_accvgpr_read_b32 v10, a10 -; GISEL-NEXT: v_accvgpr_read_b32 v11, a11 -; GISEL-NEXT: v_accvgpr_read_b32 v12, a12 -; GISEL-NEXT: v_accvgpr_read_b32 v13, a13 -; GISEL-NEXT: v_accvgpr_read_b32 v14, a14 -; GISEL-NEXT: v_accvgpr_read_b32 v15, a15 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_vgpr_sgpr__vgpr_sgpr: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, s0 +; GCN-NEXT: v_accvgpr_write_b32 a1, s1 +; GCN-NEXT: v_accvgpr_write_b32 a2, s2 +; GCN-NEXT: v_accvgpr_write_b32 a3, s3 +; GCN-NEXT: v_accvgpr_write_b32 a4, s16 +; GCN-NEXT: v_accvgpr_write_b32 a5, s17 +; GCN-NEXT: v_accvgpr_write_b32 a6, s18 +; GCN-NEXT: v_accvgpr_write_b32 a7, s19 +; GCN-NEXT: v_accvgpr_write_b32 a8, s20 +; GCN-NEXT: v_accvgpr_write_b32 a9, s21 +; GCN-NEXT: v_accvgpr_write_b32 a10, s22 +; GCN-NEXT: v_accvgpr_write_b32 a11, s23 +; GCN-NEXT: v_accvgpr_write_b32 a12, s24 +; GCN-NEXT: v_accvgpr_write_b32 a13, s25 +; GCN-NEXT: v_accvgpr_write_b32 a14, s26 +; GCN-NEXT: v_accvgpr_write_b32 a15, s27 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, s28 op_sel_hi:[0,0,0] +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 3 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1) ret <16 x float> %result } @@ -4067,48 +3993,42 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgpr_sgpr: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v32, s0 -; SDAG-NEXT: v_mov_b32_e32 v33, s1 -; SDAG-NEXT: v_mov_b32_e32 v34, s2 -; SDAG-NEXT: v_mov_b32_e32 v35, s3 -; SDAG-NEXT: v_mov_b32_e32 v36, s16 -; SDAG-NEXT: v_mov_b32_e32 v37, s17 -; SDAG-NEXT: v_mov_b32_e32 v38, s18 -; SDAG-NEXT: v_mov_b32_e32 v39, s19 -; SDAG-NEXT: v_mov_b32_e32 v16, s20 -; SDAG-NEXT: v_mov_b32_e32 v31, v13 -; SDAG-NEXT: v_mov_b32_e32 v30, v12 -; SDAG-NEXT: v_mov_b32_e32 v29, v11 -; SDAG-NEXT: v_mov_b32_e32 v28, v10 -; SDAG-NEXT: v_mov_b32_e32 v27, v9 -; SDAG-NEXT: v_mov_b32_e32 v26, v8 -; SDAG-NEXT: v_mov_b32_e32 v17, s21 -; SDAG-NEXT: v_mov_b32_e32 v18, s22 -; SDAG-NEXT: v_mov_b32_e32 v19, s23 -; SDAG-NEXT: v_mov_b32_e32 v20, s24 -; SDAG-NEXT: v_mov_b32_e32 v21, s25 -; SDAG-NEXT: v_mov_b32_e32 v22, s26 -; SDAG-NEXT: v_mov_b32_e32 v23, s27 -; SDAG-NEXT: v_mov_b32_e32 v24, s28 -; SDAG-NEXT: v_mov_b32_e32 v25, s29 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v30 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v31 +; SDAG-NEXT: v_mov_b32_e32 v16, s0 +; SDAG-NEXT: v_mov_b32_e32 v17, s1 +; SDAG-NEXT: v_mov_b32_e32 v18, s2 +; SDAG-NEXT: v_mov_b32_e32 v19, s3 +; SDAG-NEXT: v_mov_b32_e32 v20, s16 +; SDAG-NEXT: v_mov_b32_e32 v21, s17 +; SDAG-NEXT: v_mov_b32_e32 v22, s18 +; SDAG-NEXT: v_mov_b32_e32 v23, s19 +; SDAG-NEXT: v_mov_b32_e32 v24, s20 +; SDAG-NEXT: v_mov_b32_e32 v25, s21 +; SDAG-NEXT: v_mov_b32_e32 v26, s22 +; SDAG-NEXT: v_mov_b32_e32 v27, s23 +; SDAG-NEXT: v_mov_b32_e32 v28, s24 +; SDAG-NEXT: v_mov_b32_e32 v29, s25 +; SDAG-NEXT: v_mov_b32_e32 v30, s26 +; SDAG-NEXT: v_mov_b32_e32 v31, s27 +; SDAG-NEXT: v_mov_b32_e32 v32, s28 +; SDAG-NEXT: v_mov_b32_e32 v33, s29 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v25 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v26 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v27 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v28 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v29 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v30 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v31 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v32 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v33 +; SDAG-NEXT: v_accvgpr_write_b32 a10, v8 +; SDAG-NEXT: v_accvgpr_write_b32 a11, v9 +; SDAG-NEXT: v_accvgpr_write_b32 a12, v10 +; SDAG-NEXT: v_accvgpr_write_b32 a13, v11 +; SDAG-NEXT: v_accvgpr_write_b32 a14, v12 +; SDAG-NEXT: v_accvgpr_write_b32 a15, v13 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[32:39], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[16:23], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 @@ -4137,44 +4057,38 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp ; GISEL-NEXT: s_mov_b32 s13, s1 ; GISEL-NEXT: s_mov_b32 s14, s2 ; GISEL-NEXT: s_mov_b32 s15, s3 -; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[12:13] -; GISEL-NEXT: v_mov_b32_e32 v16, s20 -; GISEL-NEXT: v_mov_b32_e32 v26, v8 -; GISEL-NEXT: v_mov_b32_e32 v27, v9 -; GISEL-NEXT: v_mov_b32_e32 v28, v10 -; GISEL-NEXT: v_mov_b32_e32 v29, v11 -; GISEL-NEXT: v_mov_b32_e32 v30, v12 -; GISEL-NEXT: v_mov_b32_e32 v31, v13 -; GISEL-NEXT: v_mov_b32_e32 v17, s21 -; GISEL-NEXT: v_mov_b32_e32 v18, s22 -; GISEL-NEXT: v_mov_b32_e32 v19, s23 -; GISEL-NEXT: v_mov_b32_e32 v20, s24 -; GISEL-NEXT: v_mov_b32_e32 v21, s25 -; GISEL-NEXT: v_mov_b32_e32 v22, s26 -; GISEL-NEXT: v_mov_b32_e32 v23, s27 -; GISEL-NEXT: v_mov_b32_e32 v24, s28 -; GISEL-NEXT: v_mov_b32_e32 v25, s29 -; GISEL-NEXT: v_accvgpr_write_b32 a0, v16 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v17 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v18 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v19 -; GISEL-NEXT: v_accvgpr_write_b32 a4, v20 -; GISEL-NEXT: v_accvgpr_write_b32 a5, v21 -; GISEL-NEXT: v_accvgpr_write_b32 a6, v22 -; GISEL-NEXT: v_accvgpr_write_b32 a7, v23 -; GISEL-NEXT: v_accvgpr_write_b32 a8, v24 -; GISEL-NEXT: v_accvgpr_write_b32 a9, v25 -; GISEL-NEXT: v_accvgpr_write_b32 a10, v26 -; GISEL-NEXT: v_accvgpr_write_b32 a11, v27 -; GISEL-NEXT: v_accvgpr_write_b32 a12, v28 -; GISEL-NEXT: v_accvgpr_write_b32 a13, v29 -; GISEL-NEXT: v_accvgpr_write_b32 a14, v30 -; GISEL-NEXT: v_accvgpr_write_b32 a15, v31 +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[12:13] +; GISEL-NEXT: v_mov_b32_e32 v24, s20 +; GISEL-NEXT: v_mov_b32_e32 v25, s21 +; GISEL-NEXT: v_mov_b32_e32 v26, s22 +; GISEL-NEXT: v_mov_b32_e32 v27, s23 +; GISEL-NEXT: v_mov_b32_e32 v28, s24 +; GISEL-NEXT: v_mov_b32_e32 v29, s25 +; GISEL-NEXT: v_mov_b32_e32 v30, s26 +; GISEL-NEXT: v_mov_b32_e32 v31, s27 +; GISEL-NEXT: v_mov_b32_e32 v32, s28 +; GISEL-NEXT: v_mov_b32_e32 v33, s29 +; GISEL-NEXT: v_accvgpr_write_b32 a0, v24 +; GISEL-NEXT: v_accvgpr_write_b32 a1, v25 +; GISEL-NEXT: v_accvgpr_write_b32 a2, v26 +; GISEL-NEXT: v_accvgpr_write_b32 a3, v27 +; GISEL-NEXT: v_accvgpr_write_b32 a4, v28 +; GISEL-NEXT: v_accvgpr_write_b32 a5, v29 +; GISEL-NEXT: v_accvgpr_write_b32 a6, v30 +; GISEL-NEXT: v_accvgpr_write_b32 a7, v31 +; GISEL-NEXT: v_accvgpr_write_b32 a8, v32 +; GISEL-NEXT: v_accvgpr_write_b32 a9, v33 +; GISEL-NEXT: v_accvgpr_write_b32 a10, v8 +; GISEL-NEXT: v_accvgpr_write_b32 a11, v9 +; GISEL-NEXT: v_accvgpr_write_b32 a12, v10 +; GISEL-NEXT: v_accvgpr_write_b32 a13, v11 +; GISEL-NEXT: v_accvgpr_write_b32 a14, v12 +; GISEL-NEXT: v_accvgpr_write_b32 a15, v13 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[32:39], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0] +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[16:23], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0] ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 3 @@ -6204,4 +6118,4 @@ declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 attributes #0 = { "amdgpu-flat-work-group-size"="512,512" } attributes #1 = { "amdgpu-flat-work-group-size"="128,128" } -attributes #2 = { convergent nocallback nofree nosync nounwind willreturn memory(none) } \ No newline at end of file +attributes #2 = { convergent nocallback nofree nosync nounwind willreturn memory(none) } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll index de528d7259d7b..77d4aad5f3174 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll @@ -72,94 +72,46 @@ bb: } define <4 x float> @test_smfmac_f32_16x16x64_f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_f32_16x16x64_f16: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 a[0:3], v[0:3], v[4:11], v16 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_f32_16x16x64_f16: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GISEL-NEXT: v_mov_b32_e32 v3, v15 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_smfmac_f32_16x16x64_f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <4 x float> %result } define <4 x float> @test_smfmac_f32_16x16x64_f16__flags0(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_f32_16x16x64_f16__flags0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_f32_16x16x64_f16__flags0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GISEL-NEXT: v_mov_b32_e32 v3, v15 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_smfmac_f32_16x16x64_f16__flags0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3) ret <4 x float> %result } define <4 x float> @test_smfmac_f32_16x16x64_f16__flags1(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_f32_16x16x64_f16__flags1: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_f32_16x16x64_f16__flags1: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GISEL-NEXT: v_mov_b32_e32 v3, v15 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_smfmac_f32_16x16x64_f16__flags1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) ret <4 x float> %result } @@ -200,11 +152,13 @@ define <4 x float> @test_smfmac_f32_16x16x64_f16__sgpr(<8 x half> inreg %arg0, < ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19] ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; GISEL-NEXT: v_mov_b32_e32 v0, s24 +; GISEL-NEXT: v_mov_b32_e32 v1, s25 +; GISEL-NEXT: v_mov_b32_e32 v2, s26 +; GISEL-NEXT: v_mov_b32_e32 v3, s27 ; GISEL-NEXT: v_mov_b32_e32 v16, s28 ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_smfmac_f32_16x16x64_f16 v[0:3], v[12:15], v[4:11], v16 @@ -298,42 +252,25 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16(<8 x half> %arg0, <16 x half> ; SDAG-LABEL: test_smfmac_f32_32x32x32_f16: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 a[0:15], v[0:3], v[4:11], v28 +; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-NEXT: v_mov_b32_e32 v15, v27 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x32_f16: @@ -351,14 +288,22 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16(<8 x half> %arg0, <16 x half> ; GISEL-NEXT: v_mov_b32_e32 v35, v9 ; GISEL-NEXT: v_mov_b32_e32 v36, v10 ; GISEL-NEXT: v_mov_b32_e32 v37, v11 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27] +; GISEL-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-NEXT: v_mov_b32_e32 v1, v13 +; GISEL-NEXT: v_mov_b32_e32 v2, v14 +; GISEL-NEXT: v_mov_b32_e32 v3, v15 +; GISEL-NEXT: v_mov_b32_e32 v4, v16 +; GISEL-NEXT: v_mov_b32_e32 v5, v17 +; GISEL-NEXT: v_mov_b32_e32 v6, v18 +; GISEL-NEXT: v_mov_b32_e32 v7, v19 +; GISEL-NEXT: v_mov_b32_e32 v8, v20 +; GISEL-NEXT: v_mov_b32_e32 v9, v21 +; GISEL-NEXT: v_mov_b32_e32 v10, v22 +; GISEL-NEXT: v_mov_b32_e32 v11, v23 +; GISEL-NEXT: v_mov_b32_e32 v12, v24 +; GISEL-NEXT: v_mov_b32_e32 v13, v25 +; GISEL-NEXT: v_mov_b32_e32 v14, v26 +; GISEL-NEXT: v_mov_b32_e32 v15, v27 ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[48:51], v[30:37], v28 ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -370,42 +315,25 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__flags0(<8 x half> %arg0, <16 ; SDAG-LABEL: test_smfmac_f32_32x32x32_f16__flags0: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3 +; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-NEXT: v_mov_b32_e32 v15, v27 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x32_f16__flags0: @@ -423,14 +351,22 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__flags0(<8 x half> %arg0, <16 ; GISEL-NEXT: v_mov_b32_e32 v35, v9 ; GISEL-NEXT: v_mov_b32_e32 v36, v10 ; GISEL-NEXT: v_mov_b32_e32 v37, v11 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27] +; GISEL-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-NEXT: v_mov_b32_e32 v1, v13 +; GISEL-NEXT: v_mov_b32_e32 v2, v14 +; GISEL-NEXT: v_mov_b32_e32 v3, v15 +; GISEL-NEXT: v_mov_b32_e32 v4, v16 +; GISEL-NEXT: v_mov_b32_e32 v5, v17 +; GISEL-NEXT: v_mov_b32_e32 v6, v18 +; GISEL-NEXT: v_mov_b32_e32 v7, v19 +; GISEL-NEXT: v_mov_b32_e32 v8, v20 +; GISEL-NEXT: v_mov_b32_e32 v9, v21 +; GISEL-NEXT: v_mov_b32_e32 v10, v22 +; GISEL-NEXT: v_mov_b32_e32 v11, v23 +; GISEL-NEXT: v_mov_b32_e32 v12, v24 +; GISEL-NEXT: v_mov_b32_e32 v13, v25 +; GISEL-NEXT: v_mov_b32_e32 v14, v26 +; GISEL-NEXT: v_mov_b32_e32 v15, v27 ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3 ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -442,42 +378,25 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__flags1(<8 x half> %arg0, <16 ; SDAG-LABEL: test_smfmac_f32_32x32x32_f16__flags1: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1 +; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-NEXT: v_mov_b32_e32 v15, v27 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x32_f16__flags1: @@ -495,14 +414,22 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__flags1(<8 x half> %arg0, <16 ; GISEL-NEXT: v_mov_b32_e32 v35, v9 ; GISEL-NEXT: v_mov_b32_e32 v36, v10 ; GISEL-NEXT: v_mov_b32_e32 v37, v11 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27] +; GISEL-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-NEXT: v_mov_b32_e32 v1, v13 +; GISEL-NEXT: v_mov_b32_e32 v2, v14 +; GISEL-NEXT: v_mov_b32_e32 v3, v15 +; GISEL-NEXT: v_mov_b32_e32 v4, v16 +; GISEL-NEXT: v_mov_b32_e32 v5, v17 +; GISEL-NEXT: v_mov_b32_e32 v6, v18 +; GISEL-NEXT: v_mov_b32_e32 v7, v19 +; GISEL-NEXT: v_mov_b32_e32 v8, v20 +; GISEL-NEXT: v_mov_b32_e32 v9, v21 +; GISEL-NEXT: v_mov_b32_e32 v10, v22 +; GISEL-NEXT: v_mov_b32_e32 v11, v23 +; GISEL-NEXT: v_mov_b32_e32 v12, v24 +; GISEL-NEXT: v_mov_b32_e32 v13, v25 +; GISEL-NEXT: v_mov_b32_e32 v14, v26 +; GISEL-NEXT: v_mov_b32_e32 v15, v27 ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1 ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -514,108 +441,94 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__sgpr(<8 x half> inreg %arg0, ; SDAG-LABEL: test_smfmac_f32_32x32x32_f16__sgpr: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v28, s0 -; SDAG-NEXT: v_mov_b32_e32 v29, s1 -; SDAG-NEXT: v_mov_b32_e32 v30, s2 -; SDAG-NEXT: v_mov_b32_e32 v31, s3 -; SDAG-NEXT: v_mov_b32_e32 v12, s24 -; SDAG-NEXT: v_mov_b32_e32 v27, v9 -; SDAG-NEXT: v_mov_b32_e32 v26, v8 -; SDAG-NEXT: v_mov_b32_e32 v25, v7 -; SDAG-NEXT: v_mov_b32_e32 v24, v6 -; SDAG-NEXT: v_mov_b32_e32 v23, v5 -; SDAG-NEXT: v_mov_b32_e32 v22, v4 -; SDAG-NEXT: v_mov_b32_e32 v21, v3 -; SDAG-NEXT: v_mov_b32_e32 v20, v2 -; SDAG-NEXT: v_mov_b32_e32 v19, v1 -; SDAG-NEXT: v_mov_b32_e32 v18, v0 +; SDAG-NEXT: v_mov_b32_e32 v36, s0 +; SDAG-NEXT: v_mov_b32_e32 v37, s1 +; SDAG-NEXT: v_mov_b32_e32 v38, s2 +; SDAG-NEXT: v_mov_b32_e32 v39, s3 ; SDAG-NEXT: v_mov_b32_e32 v13, s25 ; SDAG-NEXT: v_mov_b32_e32 v14, s26 ; SDAG-NEXT: v_mov_b32_e32 v15, s27 ; SDAG-NEXT: v_mov_b32_e32 v16, s28 ; SDAG-NEXT: v_mov_b32_e32 v17, s29 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_mov_b32_e32 v0, s16 -; SDAG-NEXT: v_mov_b32_e32 v1, s17 -; SDAG-NEXT: v_mov_b32_e32 v2, s18 -; SDAG-NEXT: v_mov_b32_e32 v3, s19 -; SDAG-NEXT: v_mov_b32_e32 v4, s20 -; SDAG-NEXT: v_mov_b32_e32 v5, s21 -; SDAG-NEXT: v_mov_b32_e32 v6, s22 -; SDAG-NEXT: v_mov_b32_e32 v7, s23 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: v_mov_b32_e32 v28, s16 +; SDAG-NEXT: v_mov_b32_e32 v29, s17 +; SDAG-NEXT: v_mov_b32_e32 v30, s18 +; SDAG-NEXT: v_mov_b32_e32 v31, s19 +; SDAG-NEXT: v_mov_b32_e32 v32, s20 +; SDAG-NEXT: v_mov_b32_e32 v33, s21 +; SDAG-NEXT: v_mov_b32_e32 v34, s22 +; SDAG-NEXT: v_mov_b32_e32 v35, s23 +; SDAG-NEXT: v_mov_b32_e32 v12, s24 +; SDAG-NEXT: v_mov_b32_e32 v18, v0 +; SDAG-NEXT: v_mov_b32_e32 v19, v1 +; SDAG-NEXT: v_mov_b32_e32 v20, v2 +; SDAG-NEXT: v_mov_b32_e32 v21, v3 +; SDAG-NEXT: v_mov_b32_e32 v22, v4 +; SDAG-NEXT: v_mov_b32_e32 v23, v5 +; SDAG-NEXT: v_mov_b32_e32 v24, v6 +; SDAG-NEXT: v_mov_b32_e32 v25, v7 +; SDAG-NEXT: v_mov_b32_e32 v26, v8 +; SDAG-NEXT: v_mov_b32_e32 v27, v9 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 a[0:15], v[28:31], v[0:7], v10 +; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[36:39], v[28:35], v10 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-NEXT: v_mov_b32_e32 v15, v27 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x32_f16__sgpr: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] -; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] -; GISEL-NEXT: v_mov_b32_e32 v18, s24 -; GISEL-NEXT: v_mov_b32_e32 v19, s25 -; GISEL-NEXT: v_mov_b32_e32 v24, v0 -; GISEL-NEXT: v_mov_b32_e32 v25, v1 -; GISEL-NEXT: v_mov_b32_e32 v26, v2 -; GISEL-NEXT: v_mov_b32_e32 v27, v3 -; GISEL-NEXT: v_mov_b32_e32 v28, v4 -; GISEL-NEXT: v_mov_b32_e32 v29, v5 -; GISEL-NEXT: v_mov_b32_e32 v30, v6 -; GISEL-NEXT: v_mov_b32_e32 v31, v7 -; GISEL-NEXT: v_mov_b32_e32 v32, v8 -; GISEL-NEXT: v_mov_b32_e32 v33, v9 -; GISEL-NEXT: v_mov_b32_e32 v16, v10 -; GISEL-NEXT: v_mov_b32_e32 v20, s26 -; GISEL-NEXT: v_mov_b32_e32 v21, s27 -; GISEL-NEXT: v_mov_b32_e32 v22, s28 -; GISEL-NEXT: v_mov_b32_e32 v23, s29 -; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[28:29] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[30:31] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[32:33] +; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[0:1] +; GISEL-NEXT: v_mov_b32_e32 v11, v0 +; GISEL-NEXT: v_mov_b32_e32 v12, v1 +; GISEL-NEXT: v_mov_b32_e32 v13, v2 +; GISEL-NEXT: v_mov_b32_e32 v14, v3 +; GISEL-NEXT: v_mov_b32_e32 v15, v4 +; GISEL-NEXT: v_mov_b32_e32 v16, v5 +; GISEL-NEXT: v_mov_b32_e32 v17, v6 +; GISEL-NEXT: v_mov_b32_e32 v18, v7 +; GISEL-NEXT: v_mov_b32_e32 v19, v8 +; GISEL-NEXT: v_mov_b32_e32 v20, v9 +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[22:23] +; GISEL-NEXT: v_mov_b32_e32 v21, v10 +; GISEL-NEXT: v_mov_b32_e32 v0, s24 +; GISEL-NEXT: v_mov_b32_e32 v1, s25 +; GISEL-NEXT: v_mov_b32_e32 v2, s26 +; GISEL-NEXT: v_mov_b32_e32 v3, s27 +; GISEL-NEXT: v_mov_b32_e32 v4, s28 +; GISEL-NEXT: v_mov_b32_e32 v5, s29 +; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17] +; GISEL-NEXT: v_mov_b32_e32 v6, v11 +; GISEL-NEXT: v_mov_b32_e32 v7, v12 +; GISEL-NEXT: v_mov_b32_e32 v8, v13 +; GISEL-NEXT: v_mov_b32_e32 v9, v14 +; GISEL-NEXT: v_mov_b32_e32 v10, v15 +; GISEL-NEXT: v_mov_b32_e32 v11, v16 +; GISEL-NEXT: v_mov_b32_e32 v12, v17 +; GISEL-NEXT: v_mov_b32_e32 v13, v18 +; GISEL-NEXT: v_mov_b32_e32 v14, v19 +; GISEL-NEXT: v_mov_b32_e32 v15, v20 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[34:37], v[48:55], v16 +; GISEL-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[30:33], v[22:29], v21 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.f16(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <16 x float> %result @@ -666,17 +579,12 @@ define <4 x float> @test_smfmac_f32_16x16x64_bf16(<8 x bfloat> %arg0, <16 x bflo ; GCN-LABEL: test_smfmac_f32_16x16x64_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 a[0:3], v[0:3], v[4:11], v16 +; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 v[12:15], v[0:3], v[4:11], v16 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <4 x float> %result @@ -686,17 +594,12 @@ define <4 x float> @test_smfmac_f32_16x16x64_bf16__flags0(<8 x bfloat> %arg0, <1 ; GCN-LABEL: test_smfmac_f32_16x16x64_bf16__flags0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3 +; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3) ret <4 x float> %result @@ -706,17 +609,12 @@ define <4 x float> @test_smfmac_f32_16x16x64_bf16__flags1(<8 x bfloat> %arg0, <1 ; GCN-LABEL: test_smfmac_f32_16x16x64_bf16__flags1: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1 +; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) ret <4 x float> %result @@ -807,42 +705,25 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16(<8 x bfloat> %arg0, <16 x bfl ; GCN-LABEL: test_smfmac_f32_32x32x32_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: v_accvgpr_write_b32 a4, v16 -; GCN-NEXT: v_accvgpr_write_b32 a5, v17 -; GCN-NEXT: v_accvgpr_write_b32 a6, v18 -; GCN-NEXT: v_accvgpr_write_b32 a7, v19 -; GCN-NEXT: v_accvgpr_write_b32 a8, v20 -; GCN-NEXT: v_accvgpr_write_b32 a9, v21 -; GCN-NEXT: v_accvgpr_write_b32 a10, v22 -; GCN-NEXT: v_accvgpr_write_b32 a11, v23 -; GCN-NEXT: v_accvgpr_write_b32 a12, v24 -; GCN-NEXT: v_accvgpr_write_b32 a13, v25 -; GCN-NEXT: v_accvgpr_write_b32 a14, v26 -; GCN-NEXT: v_accvgpr_write_b32 a15, v27 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 a[0:15], v[0:3], v[4:11], v28 +; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 v[12:27], v[0:3], v[4:11], v28 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: v_mov_b32_e32 v4, v16 +; GCN-NEXT: v_mov_b32_e32 v5, v17 +; GCN-NEXT: v_mov_b32_e32 v6, v18 +; GCN-NEXT: v_mov_b32_e32 v7, v19 +; GCN-NEXT: v_mov_b32_e32 v8, v20 +; GCN-NEXT: v_mov_b32_e32 v9, v21 +; GCN-NEXT: v_mov_b32_e32 v10, v22 +; GCN-NEXT: v_mov_b32_e32 v11, v23 +; GCN-NEXT: v_mov_b32_e32 v12, v24 +; GCN-NEXT: v_mov_b32_e32 v13, v25 +; GCN-NEXT: v_mov_b32_e32 v14, v26 +; GCN-NEXT: v_mov_b32_e32 v15, v27 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <16 x float> %result @@ -852,42 +733,25 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__flags0(<8 x bfloat> %arg0, < ; GCN-LABEL: test_smfmac_f32_32x32x32_bf16__flags0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: v_accvgpr_write_b32 a4, v16 -; GCN-NEXT: v_accvgpr_write_b32 a5, v17 -; GCN-NEXT: v_accvgpr_write_b32 a6, v18 -; GCN-NEXT: v_accvgpr_write_b32 a7, v19 -; GCN-NEXT: v_accvgpr_write_b32 a8, v20 -; GCN-NEXT: v_accvgpr_write_b32 a9, v21 -; GCN-NEXT: v_accvgpr_write_b32 a10, v22 -; GCN-NEXT: v_accvgpr_write_b32 a11, v23 -; GCN-NEXT: v_accvgpr_write_b32 a12, v24 -; GCN-NEXT: v_accvgpr_write_b32 a13, v25 -; GCN-NEXT: v_accvgpr_write_b32 a14, v26 -; GCN-NEXT: v_accvgpr_write_b32 a15, v27 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3 +; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: v_mov_b32_e32 v4, v16 +; GCN-NEXT: v_mov_b32_e32 v5, v17 +; GCN-NEXT: v_mov_b32_e32 v6, v18 +; GCN-NEXT: v_mov_b32_e32 v7, v19 +; GCN-NEXT: v_mov_b32_e32 v8, v20 +; GCN-NEXT: v_mov_b32_e32 v9, v21 +; GCN-NEXT: v_mov_b32_e32 v10, v22 +; GCN-NEXT: v_mov_b32_e32 v11, v23 +; GCN-NEXT: v_mov_b32_e32 v12, v24 +; GCN-NEXT: v_mov_b32_e32 v13, v25 +; GCN-NEXT: v_mov_b32_e32 v14, v26 +; GCN-NEXT: v_mov_b32_e32 v15, v27 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3) ret <16 x float> %result @@ -897,42 +761,25 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__flags1(<8 x bfloat> %arg0, < ; GCN-LABEL: test_smfmac_f32_32x32x32_bf16__flags1: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: v_accvgpr_write_b32 a4, v16 -; GCN-NEXT: v_accvgpr_write_b32 a5, v17 -; GCN-NEXT: v_accvgpr_write_b32 a6, v18 -; GCN-NEXT: v_accvgpr_write_b32 a7, v19 -; GCN-NEXT: v_accvgpr_write_b32 a8, v20 -; GCN-NEXT: v_accvgpr_write_b32 a9, v21 -; GCN-NEXT: v_accvgpr_write_b32 a10, v22 -; GCN-NEXT: v_accvgpr_write_b32 a11, v23 -; GCN-NEXT: v_accvgpr_write_b32 a12, v24 -; GCN-NEXT: v_accvgpr_write_b32 a13, v25 -; GCN-NEXT: v_accvgpr_write_b32 a14, v26 -; GCN-NEXT: v_accvgpr_write_b32 a15, v27 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1 +; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: v_mov_b32_e32 v4, v16 +; GCN-NEXT: v_mov_b32_e32 v5, v17 +; GCN-NEXT: v_mov_b32_e32 v6, v18 +; GCN-NEXT: v_mov_b32_e32 v7, v19 +; GCN-NEXT: v_mov_b32_e32 v8, v20 +; GCN-NEXT: v_mov_b32_e32 v9, v21 +; GCN-NEXT: v_mov_b32_e32 v10, v22 +; GCN-NEXT: v_mov_b32_e32 v11, v23 +; GCN-NEXT: v_mov_b32_e32 v12, v24 +; GCN-NEXT: v_mov_b32_e32 v13, v25 +; GCN-NEXT: v_mov_b32_e32 v14, v26 +; GCN-NEXT: v_mov_b32_e32 v15, v27 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) ret <16 x float> %result @@ -942,70 +789,54 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__sgpr(<8 x bfloat> inreg %arg ; GCN-LABEL: test_smfmac_f32_32x32x32_bf16__sgpr: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v28, s0 -; GCN-NEXT: v_mov_b32_e32 v29, s1 -; GCN-NEXT: v_mov_b32_e32 v30, s2 -; GCN-NEXT: v_mov_b32_e32 v31, s3 -; GCN-NEXT: v_mov_b32_e32 v12, s24 -; GCN-NEXT: v_mov_b32_e32 v27, v9 -; GCN-NEXT: v_mov_b32_e32 v26, v8 -; GCN-NEXT: v_mov_b32_e32 v25, v7 -; GCN-NEXT: v_mov_b32_e32 v24, v6 -; GCN-NEXT: v_mov_b32_e32 v23, v5 -; GCN-NEXT: v_mov_b32_e32 v22, v4 -; GCN-NEXT: v_mov_b32_e32 v21, v3 -; GCN-NEXT: v_mov_b32_e32 v20, v2 -; GCN-NEXT: v_mov_b32_e32 v19, v1 -; GCN-NEXT: v_mov_b32_e32 v18, v0 +; GCN-NEXT: v_mov_b32_e32 v36, s0 +; GCN-NEXT: v_mov_b32_e32 v37, s1 +; GCN-NEXT: v_mov_b32_e32 v38, s2 +; GCN-NEXT: v_mov_b32_e32 v39, s3 ; GCN-NEXT: v_mov_b32_e32 v13, s25 ; GCN-NEXT: v_mov_b32_e32 v14, s26 ; GCN-NEXT: v_mov_b32_e32 v15, s27 ; GCN-NEXT: v_mov_b32_e32 v16, s28 ; GCN-NEXT: v_mov_b32_e32 v17, s29 -; GCN-NEXT: v_accvgpr_write_b32 a0, v12 -; GCN-NEXT: v_mov_b32_e32 v0, s16 -; GCN-NEXT: v_mov_b32_e32 v1, s17 -; GCN-NEXT: v_mov_b32_e32 v2, s18 -; GCN-NEXT: v_mov_b32_e32 v3, s19 -; GCN-NEXT: v_mov_b32_e32 v4, s20 -; GCN-NEXT: v_mov_b32_e32 v5, s21 -; GCN-NEXT: v_mov_b32_e32 v6, s22 -; GCN-NEXT: v_mov_b32_e32 v7, s23 -; GCN-NEXT: v_accvgpr_write_b32 a1, v13 -; GCN-NEXT: v_accvgpr_write_b32 a2, v14 -; GCN-NEXT: v_accvgpr_write_b32 a3, v15 -; GCN-NEXT: v_accvgpr_write_b32 a4, v16 -; GCN-NEXT: v_accvgpr_write_b32 a5, v17 -; GCN-NEXT: v_accvgpr_write_b32 a6, v18 -; GCN-NEXT: v_accvgpr_write_b32 a7, v19 -; GCN-NEXT: v_accvgpr_write_b32 a8, v20 -; GCN-NEXT: v_accvgpr_write_b32 a9, v21 -; GCN-NEXT: v_accvgpr_write_b32 a10, v22 -; GCN-NEXT: v_accvgpr_write_b32 a11, v23 -; GCN-NEXT: v_accvgpr_write_b32 a12, v24 -; GCN-NEXT: v_accvgpr_write_b32 a13, v25 -; GCN-NEXT: v_accvgpr_write_b32 a14, v26 -; GCN-NEXT: v_accvgpr_write_b32 a15, v27 +; GCN-NEXT: v_mov_b32_e32 v28, s16 +; GCN-NEXT: v_mov_b32_e32 v29, s17 +; GCN-NEXT: v_mov_b32_e32 v30, s18 +; GCN-NEXT: v_mov_b32_e32 v31, s19 +; GCN-NEXT: v_mov_b32_e32 v32, s20 +; GCN-NEXT: v_mov_b32_e32 v33, s21 +; GCN-NEXT: v_mov_b32_e32 v34, s22 +; GCN-NEXT: v_mov_b32_e32 v35, s23 +; GCN-NEXT: v_mov_b32_e32 v12, s24 +; GCN-NEXT: v_mov_b32_e32 v18, v0 +; GCN-NEXT: v_mov_b32_e32 v19, v1 +; GCN-NEXT: v_mov_b32_e32 v20, v2 +; GCN-NEXT: v_mov_b32_e32 v21, v3 +; GCN-NEXT: v_mov_b32_e32 v22, v4 +; GCN-NEXT: v_mov_b32_e32 v23, v5 +; GCN-NEXT: v_mov_b32_e32 v24, v6 +; GCN-NEXT: v_mov_b32_e32 v25, v7 +; GCN-NEXT: v_mov_b32_e32 v26, v8 +; GCN-NEXT: v_mov_b32_e32 v27, v9 ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 a[0:15], v[28:31], v[0:7], v10 +; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 v[12:27], v[36:39], v[28:35], v10 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 3 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: v_accvgpr_read_b32 v4, a4 -; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: v_accvgpr_read_b32 v6, a6 -; GCN-NEXT: v_accvgpr_read_b32 v7, a7 -; GCN-NEXT: v_accvgpr_read_b32 v8, a8 -; GCN-NEXT: v_accvgpr_read_b32 v9, a9 -; GCN-NEXT: v_accvgpr_read_b32 v10, a10 -; GCN-NEXT: v_accvgpr_read_b32 v11, a11 -; GCN-NEXT: v_accvgpr_read_b32 v12, a12 -; GCN-NEXT: v_accvgpr_read_b32 v13, a13 -; GCN-NEXT: v_accvgpr_read_b32 v14, a14 -; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: v_mov_b32_e32 v4, v16 +; GCN-NEXT: v_mov_b32_e32 v5, v17 +; GCN-NEXT: v_mov_b32_e32 v6, v18 +; GCN-NEXT: v_mov_b32_e32 v7, v19 +; GCN-NEXT: v_mov_b32_e32 v8, v20 +; GCN-NEXT: v_mov_b32_e32 v9, v21 +; GCN-NEXT: v_mov_b32_e32 v10, v22 +; GCN-NEXT: v_mov_b32_e32 v11, v23 +; GCN-NEXT: v_mov_b32_e32 v12, v24 +; GCN-NEXT: v_mov_b32_e32 v13, v25 +; GCN-NEXT: v_mov_b32_e32 v14, v26 +; GCN-NEXT: v_mov_b32_e32 v15, v27 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <16 x float> %result @@ -1085,46 +916,73 @@ bb: } define <4 x i32> @test_smfmac_i32_16x16x128_i8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_i32_16x16x128_i8: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 a[0:3], v[0:3], v[4:11], v16 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_i32_16x16x128_i8: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GISEL-NEXT: v_mov_b32_e32 v3, v15 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_smfmac_i32_16x16x128_i8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x128.i8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <4 x i32> %result } define <4 x i32> @test_smfmac_i32_16x16x128_i8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_i32_16x16x128_i8__flags0: +; GCN-LABEL: test_smfmac_i32_16x16x128_i8__flags0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x128.i8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3) + ret <4 x i32> %result +} + +define <4 x i32> @test_smfmac_i32_16x16x128_i8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3) { +; GCN-LABEL: test_smfmac_i32_16x16x128_i8__flags1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x128.i8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) + ret <4 x i32> %result +} + +define <4 x i32> @test_smfmac_i32_16x16x128_i8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <4 x i32> inreg %arg2, i32 inreg %arg3) { +; SDAG-LABEL: test_smfmac_i32_16x16x128_i8__sgpr: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 +; SDAG-NEXT: v_mov_b32_e32 v8, s0 +; SDAG-NEXT: v_mov_b32_e32 v9, s1 +; SDAG-NEXT: v_mov_b32_e32 v10, s2 +; SDAG-NEXT: v_mov_b32_e32 v11, s3 +; SDAG-NEXT: v_mov_b32_e32 v0, s16 +; SDAG-NEXT: v_mov_b32_e32 v1, s17 +; SDAG-NEXT: v_mov_b32_e32 v2, s18 +; SDAG-NEXT: v_mov_b32_e32 v3, s19 +; SDAG-NEXT: v_mov_b32_e32 v4, s20 +; SDAG-NEXT: v_mov_b32_e32 v5, s21 +; SDAG-NEXT: v_mov_b32_e32 v6, s22 +; SDAG-NEXT: v_mov_b32_e32 v7, s23 +; SDAG-NEXT: v_accvgpr_write_b32 a0, s24 +; SDAG-NEXT: v_accvgpr_write_b32 a1, s25 +; SDAG-NEXT: v_accvgpr_write_b32 a2, s26 +; SDAG-NEXT: v_accvgpr_write_b32 a3, s27 +; SDAG-NEXT: v_mov_b32_e32 v12, s28 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3 +; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 a[0:3], v[8:11], v[0:7], v12 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 @@ -1132,92 +990,19 @@ define <4 x i32> @test_smfmac_i32_16x16x128_i8__flags0(<4 x i32> %arg0, <8 x i32 ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-LABEL: test_smfmac_i32_16x16x128_i8__flags0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GISEL-NEXT: v_mov_b32_e32 v3, v15 -; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x128.i8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3) - ret <4 x i32> %result -} - -define <4 x i32> @test_smfmac_i32_16x16x128_i8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_i32_16x16x128_i8__flags1: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_i32_16x16x128_i8__flags1: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GISEL-NEXT: v_mov_b32_e32 v3, v15 -; GISEL-NEXT: s_setpc_b64 s[30:31] - %result = call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x128.i8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) - ret <4 x i32> %result -} - -define <4 x i32> @test_smfmac_i32_16x16x128_i8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <4 x i32> inreg %arg2, i32 inreg %arg3) { -; SDAG-LABEL: test_smfmac_i32_16x16x128_i8__sgpr: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v8, s0 -; SDAG-NEXT: v_mov_b32_e32 v9, s1 -; SDAG-NEXT: v_mov_b32_e32 v10, s2 -; SDAG-NEXT: v_mov_b32_e32 v11, s3 -; SDAG-NEXT: v_mov_b32_e32 v0, s16 -; SDAG-NEXT: v_mov_b32_e32 v1, s17 -; SDAG-NEXT: v_mov_b32_e32 v2, s18 -; SDAG-NEXT: v_mov_b32_e32 v3, s19 -; SDAG-NEXT: v_mov_b32_e32 v4, s20 -; SDAG-NEXT: v_mov_b32_e32 v5, s21 -; SDAG-NEXT: v_mov_b32_e32 v6, s22 -; SDAG-NEXT: v_mov_b32_e32 v7, s23 -; SDAG-NEXT: v_accvgpr_write_b32 a0, s24 -; SDAG-NEXT: v_accvgpr_write_b32 a1, s25 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s26 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s27 -; SDAG-NEXT: v_mov_b32_e32 v12, s28 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 a[0:3], v[8:11], v[0:7], v12 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_i32_16x16x128_i8__sgpr: +; GISEL-LABEL: test_smfmac_i32_16x16x128_i8__sgpr: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19] ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; GISEL-NEXT: v_mov_b32_e32 v0, s24 +; GISEL-NEXT: v_mov_b32_e32 v1, s25 +; GISEL-NEXT: v_mov_b32_e32 v2, s26 +; GISEL-NEXT: v_mov_b32_e32 v3, s27 ; GISEL-NEXT: v_mov_b32_e32 v16, s28 ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_smfmac_i32_16x16x128_i8 v[0:3], v[12:15], v[4:11], v16 @@ -1317,42 +1102,25 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8(<4 x i32> %arg0, <8 x i32> %arg1, ; SDAG-LABEL: test_smfmac_i32_32x32x64_i8: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 a[0:15], v[0:3], v[4:11], v28 +; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-NEXT: v_mov_b32_e32 v15, v27 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_i32_32x32x64_i8: @@ -1370,14 +1138,22 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8(<4 x i32> %arg0, <8 x i32> %arg1, ; GISEL-NEXT: v_mov_b32_e32 v35, v9 ; GISEL-NEXT: v_mov_b32_e32 v36, v10 ; GISEL-NEXT: v_mov_b32_e32 v37, v11 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27] +; GISEL-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-NEXT: v_mov_b32_e32 v1, v13 +; GISEL-NEXT: v_mov_b32_e32 v2, v14 +; GISEL-NEXT: v_mov_b32_e32 v3, v15 +; GISEL-NEXT: v_mov_b32_e32 v4, v16 +; GISEL-NEXT: v_mov_b32_e32 v5, v17 +; GISEL-NEXT: v_mov_b32_e32 v6, v18 +; GISEL-NEXT: v_mov_b32_e32 v7, v19 +; GISEL-NEXT: v_mov_b32_e32 v8, v20 +; GISEL-NEXT: v_mov_b32_e32 v9, v21 +; GISEL-NEXT: v_mov_b32_e32 v10, v22 +; GISEL-NEXT: v_mov_b32_e32 v11, v23 +; GISEL-NEXT: v_mov_b32_e32 v12, v24 +; GISEL-NEXT: v_mov_b32_e32 v13, v25 +; GISEL-NEXT: v_mov_b32_e32 v14, v26 +; GISEL-NEXT: v_mov_b32_e32 v15, v27 ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[48:51], v[30:37], v28 ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -1389,42 +1165,25 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags0(<4 x i32> %arg0, <8 x i32 ; SDAG-LABEL: test_smfmac_i32_32x32x64_i8__flags0: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3 +; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-NEXT: v_mov_b32_e32 v15, v27 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_i32_32x32x64_i8__flags0: @@ -1442,14 +1201,22 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags0(<4 x i32> %arg0, <8 x i32 ; GISEL-NEXT: v_mov_b32_e32 v35, v9 ; GISEL-NEXT: v_mov_b32_e32 v36, v10 ; GISEL-NEXT: v_mov_b32_e32 v37, v11 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27] +; GISEL-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-NEXT: v_mov_b32_e32 v1, v13 +; GISEL-NEXT: v_mov_b32_e32 v2, v14 +; GISEL-NEXT: v_mov_b32_e32 v3, v15 +; GISEL-NEXT: v_mov_b32_e32 v4, v16 +; GISEL-NEXT: v_mov_b32_e32 v5, v17 +; GISEL-NEXT: v_mov_b32_e32 v6, v18 +; GISEL-NEXT: v_mov_b32_e32 v7, v19 +; GISEL-NEXT: v_mov_b32_e32 v8, v20 +; GISEL-NEXT: v_mov_b32_e32 v9, v21 +; GISEL-NEXT: v_mov_b32_e32 v10, v22 +; GISEL-NEXT: v_mov_b32_e32 v11, v23 +; GISEL-NEXT: v_mov_b32_e32 v12, v24 +; GISEL-NEXT: v_mov_b32_e32 v13, v25 +; GISEL-NEXT: v_mov_b32_e32 v14, v26 +; GISEL-NEXT: v_mov_b32_e32 v15, v27 ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3 ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -1461,42 +1228,25 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags1(<4 x i32> %arg0, <8 x i32 ; SDAG-LABEL: test_smfmac_i32_32x32x64_i8__flags1: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1 +; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-NEXT: v_mov_b32_e32 v15, v27 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_i32_32x32x64_i8__flags1: @@ -1514,14 +1264,22 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags1(<4 x i32> %arg0, <8 x i32 ; GISEL-NEXT: v_mov_b32_e32 v35, v9 ; GISEL-NEXT: v_mov_b32_e32 v36, v10 ; GISEL-NEXT: v_mov_b32_e32 v37, v11 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27] +; GISEL-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-NEXT: v_mov_b32_e32 v1, v13 +; GISEL-NEXT: v_mov_b32_e32 v2, v14 +; GISEL-NEXT: v_mov_b32_e32 v3, v15 +; GISEL-NEXT: v_mov_b32_e32 v4, v16 +; GISEL-NEXT: v_mov_b32_e32 v5, v17 +; GISEL-NEXT: v_mov_b32_e32 v6, v18 +; GISEL-NEXT: v_mov_b32_e32 v7, v19 +; GISEL-NEXT: v_mov_b32_e32 v8, v20 +; GISEL-NEXT: v_mov_b32_e32 v9, v21 +; GISEL-NEXT: v_mov_b32_e32 v10, v22 +; GISEL-NEXT: v_mov_b32_e32 v11, v23 +; GISEL-NEXT: v_mov_b32_e32 v12, v24 +; GISEL-NEXT: v_mov_b32_e32 v13, v25 +; GISEL-NEXT: v_mov_b32_e32 v14, v26 +; GISEL-NEXT: v_mov_b32_e32 v15, v27 ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1 ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -1533,108 +1291,94 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__sgpr(<4 x i32> inreg %arg0, <8 x ; SDAG-LABEL: test_smfmac_i32_32x32x64_i8__sgpr: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v28, s0 -; SDAG-NEXT: v_mov_b32_e32 v29, s1 -; SDAG-NEXT: v_mov_b32_e32 v30, s2 -; SDAG-NEXT: v_mov_b32_e32 v31, s3 -; SDAG-NEXT: v_mov_b32_e32 v12, s24 -; SDAG-NEXT: v_mov_b32_e32 v27, v9 -; SDAG-NEXT: v_mov_b32_e32 v26, v8 -; SDAG-NEXT: v_mov_b32_e32 v25, v7 -; SDAG-NEXT: v_mov_b32_e32 v24, v6 -; SDAG-NEXT: v_mov_b32_e32 v23, v5 -; SDAG-NEXT: v_mov_b32_e32 v22, v4 -; SDAG-NEXT: v_mov_b32_e32 v21, v3 -; SDAG-NEXT: v_mov_b32_e32 v20, v2 -; SDAG-NEXT: v_mov_b32_e32 v19, v1 -; SDAG-NEXT: v_mov_b32_e32 v18, v0 +; SDAG-NEXT: v_mov_b32_e32 v36, s0 +; SDAG-NEXT: v_mov_b32_e32 v37, s1 +; SDAG-NEXT: v_mov_b32_e32 v38, s2 +; SDAG-NEXT: v_mov_b32_e32 v39, s3 ; SDAG-NEXT: v_mov_b32_e32 v13, s25 ; SDAG-NEXT: v_mov_b32_e32 v14, s26 ; SDAG-NEXT: v_mov_b32_e32 v15, s27 ; SDAG-NEXT: v_mov_b32_e32 v16, s28 ; SDAG-NEXT: v_mov_b32_e32 v17, s29 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_mov_b32_e32 v0, s16 -; SDAG-NEXT: v_mov_b32_e32 v1, s17 -; SDAG-NEXT: v_mov_b32_e32 v2, s18 -; SDAG-NEXT: v_mov_b32_e32 v3, s19 -; SDAG-NEXT: v_mov_b32_e32 v4, s20 -; SDAG-NEXT: v_mov_b32_e32 v5, s21 -; SDAG-NEXT: v_mov_b32_e32 v6, s22 -; SDAG-NEXT: v_mov_b32_e32 v7, s23 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: v_mov_b32_e32 v28, s16 +; SDAG-NEXT: v_mov_b32_e32 v29, s17 +; SDAG-NEXT: v_mov_b32_e32 v30, s18 +; SDAG-NEXT: v_mov_b32_e32 v31, s19 +; SDAG-NEXT: v_mov_b32_e32 v32, s20 +; SDAG-NEXT: v_mov_b32_e32 v33, s21 +; SDAG-NEXT: v_mov_b32_e32 v34, s22 +; SDAG-NEXT: v_mov_b32_e32 v35, s23 +; SDAG-NEXT: v_mov_b32_e32 v12, s24 +; SDAG-NEXT: v_mov_b32_e32 v18, v0 +; SDAG-NEXT: v_mov_b32_e32 v19, v1 +; SDAG-NEXT: v_mov_b32_e32 v20, v2 +; SDAG-NEXT: v_mov_b32_e32 v21, v3 +; SDAG-NEXT: v_mov_b32_e32 v22, v4 +; SDAG-NEXT: v_mov_b32_e32 v23, v5 +; SDAG-NEXT: v_mov_b32_e32 v24, v6 +; SDAG-NEXT: v_mov_b32_e32 v25, v7 +; SDAG-NEXT: v_mov_b32_e32 v26, v8 +; SDAG-NEXT: v_mov_b32_e32 v27, v9 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 a[0:15], v[28:31], v[0:7], v10 +; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[36:39], v[28:35], v10 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-NEXT: v_mov_b32_e32 v15, v27 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_i32_32x32x64_i8__sgpr: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] -; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] -; GISEL-NEXT: v_mov_b32_e32 v18, s24 -; GISEL-NEXT: v_mov_b32_e32 v19, s25 -; GISEL-NEXT: v_mov_b32_e32 v24, v0 -; GISEL-NEXT: v_mov_b32_e32 v25, v1 -; GISEL-NEXT: v_mov_b32_e32 v26, v2 -; GISEL-NEXT: v_mov_b32_e32 v27, v3 -; GISEL-NEXT: v_mov_b32_e32 v28, v4 -; GISEL-NEXT: v_mov_b32_e32 v29, v5 -; GISEL-NEXT: v_mov_b32_e32 v30, v6 -; GISEL-NEXT: v_mov_b32_e32 v31, v7 -; GISEL-NEXT: v_mov_b32_e32 v32, v8 -; GISEL-NEXT: v_mov_b32_e32 v33, v9 -; GISEL-NEXT: v_mov_b32_e32 v16, v10 -; GISEL-NEXT: v_mov_b32_e32 v20, s26 -; GISEL-NEXT: v_mov_b32_e32 v21, s27 -; GISEL-NEXT: v_mov_b32_e32 v22, s28 -; GISEL-NEXT: v_mov_b32_e32 v23, s29 -; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[28:29] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[30:31] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[32:33] +; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[0:1] +; GISEL-NEXT: v_mov_b32_e32 v11, v0 +; GISEL-NEXT: v_mov_b32_e32 v12, v1 +; GISEL-NEXT: v_mov_b32_e32 v13, v2 +; GISEL-NEXT: v_mov_b32_e32 v14, v3 +; GISEL-NEXT: v_mov_b32_e32 v15, v4 +; GISEL-NEXT: v_mov_b32_e32 v16, v5 +; GISEL-NEXT: v_mov_b32_e32 v17, v6 +; GISEL-NEXT: v_mov_b32_e32 v18, v7 +; GISEL-NEXT: v_mov_b32_e32 v19, v8 +; GISEL-NEXT: v_mov_b32_e32 v20, v9 +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[22:23] +; GISEL-NEXT: v_mov_b32_e32 v21, v10 +; GISEL-NEXT: v_mov_b32_e32 v0, s24 +; GISEL-NEXT: v_mov_b32_e32 v1, s25 +; GISEL-NEXT: v_mov_b32_e32 v2, s26 +; GISEL-NEXT: v_mov_b32_e32 v3, s27 +; GISEL-NEXT: v_mov_b32_e32 v4, s28 +; GISEL-NEXT: v_mov_b32_e32 v5, s29 +; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17] +; GISEL-NEXT: v_mov_b32_e32 v6, v11 +; GISEL-NEXT: v_mov_b32_e32 v7, v12 +; GISEL-NEXT: v_mov_b32_e32 v8, v13 +; GISEL-NEXT: v_mov_b32_e32 v9, v14 +; GISEL-NEXT: v_mov_b32_e32 v10, v15 +; GISEL-NEXT: v_mov_b32_e32 v11, v16 +; GISEL-NEXT: v_mov_b32_e32 v12, v17 +; GISEL-NEXT: v_mov_b32_e32 v13, v18 +; GISEL-NEXT: v_mov_b32_e32 v14, v19 +; GISEL-NEXT: v_mov_b32_e32 v15, v20 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[34:37], v[48:55], v16 +; GISEL-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[30:33], v[22:29], v21 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <16 x i32> %result @@ -1714,94 +1458,46 @@ bb: } define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_bf8: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 a[0:3], v[0:3], v[4:11], v16 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_bf8: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GISEL-NEXT: v_mov_b32_e32 v3, v15 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_smfmac_f32_16x16x128_bf8_bf8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <4 x float> %result } define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__flags0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__flags0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GISEL-NEXT: v_mov_b32_e32 v3, v15 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__flags0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3) ret <4 x float> %result } define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__flags1: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__flags1: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GISEL-NEXT: v_mov_b32_e32 v3, v15 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__flags1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) ret <4 x float> %result } @@ -1842,11 +1538,13 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__sgpr(<4 x i32> inreg %arg ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19] ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; GISEL-NEXT: v_mov_b32_e32 v0, s24 +; GISEL-NEXT: v_mov_b32_e32 v1, s25 +; GISEL-NEXT: v_mov_b32_e32 v2, s26 +; GISEL-NEXT: v_mov_b32_e32 v3, s27 ; GISEL-NEXT: v_mov_b32_e32 v16, s28 ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[0:3], v[12:15], v[4:11], v16 @@ -1929,94 +1627,46 @@ bb: } define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_fp8: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[0:3], v[4:11], v16 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GISEL-NEXT: v_mov_b32_e32 v3, v15 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_smfmac_f32_16x16x128_bf8_fp8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <4 x float> %result } define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GISEL-NEXT: v_mov_b32_e32 v3, v15 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3) ret <4 x float> %result } define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags1: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags1: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GISEL-NEXT: v_mov_b32_e32 v3, v15 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) ret <4 x float> %result } @@ -2057,11 +1707,13 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__sgpr(<4 x i32> inreg %arg ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19] ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; GISEL-NEXT: v_mov_b32_e32 v0, s24 +; GISEL-NEXT: v_mov_b32_e32 v1, s25 +; GISEL-NEXT: v_mov_b32_e32 v2, s26 +; GISEL-NEXT: v_mov_b32_e32 v3, s27 ; GISEL-NEXT: v_mov_b32_e32 v16, s28 ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[0:3], v[12:15], v[4:11], v16 @@ -2144,94 +1796,46 @@ bb: } define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_bf8: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 a[0:3], v[0:3], v[4:11], v16 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_bf8: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GISEL-NEXT: v_mov_b32_e32 v3, v15 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_smfmac_f32_16x16x128_fp8_bf8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <4 x float> %result } define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GISEL-NEXT: v_mov_b32_e32 v3, v15 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3) ret <4 x float> %result } define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags1: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags1: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GISEL-NEXT: v_mov_b32_e32 v3, v15 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) ret <4 x float> %result } @@ -2272,11 +1876,13 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__sgpr(<4 x i32> inreg %arg ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19] ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; GISEL-NEXT: v_mov_b32_e32 v0, s24 +; GISEL-NEXT: v_mov_b32_e32 v1, s25 +; GISEL-NEXT: v_mov_b32_e32 v2, s26 +; GISEL-NEXT: v_mov_b32_e32 v3, s27 ; GISEL-NEXT: v_mov_b32_e32 v16, s28 ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[0:3], v[12:15], v[4:11], v16 @@ -2359,94 +1965,46 @@ bb: } define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_fp8: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[0:3], v[4:11], v16 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GISEL-NEXT: v_mov_b32_e32 v3, v15 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_smfmac_f32_16x16x128_fp8_fp8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <4 x float> %result } define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GISEL-NEXT: v_mov_b32_e32 v3, v15 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3) - ret <4 x float> %result -} - -define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags1: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags1: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GISEL-NEXT: v_mov_b32_e32 v3, v15 -; GISEL-NEXT: s_setpc_b64 s[30:31] + ret <4 x float> %result +} + +define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) { +; GCN-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: v_mov_b32_e32 v0, v12 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v14 +; GCN-NEXT: v_mov_b32_e32 v3, v15 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) ret <4 x float> %result } @@ -2487,11 +2045,13 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__sgpr(<4 x i32> inreg %arg ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19] ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; GISEL-NEXT: v_mov_b32_e32 v0, s24 +; GISEL-NEXT: v_mov_b32_e32 v1, s25 +; GISEL-NEXT: v_mov_b32_e32 v2, s26 +; GISEL-NEXT: v_mov_b32_e32 v3, s27 ; GISEL-NEXT: v_mov_b32_e32 v16, s28 ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[0:3], v[12:15], v[4:11], v16 @@ -2591,42 +2151,25 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8(<4 x i32> %arg0, <8 x i32> ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 a[0:15], v[0:3], v[4:11], v28 +; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-NEXT: v_mov_b32_e32 v15, v27 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8: @@ -2644,14 +2187,22 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8(<4 x i32> %arg0, <8 x i32> ; GISEL-NEXT: v_mov_b32_e32 v35, v9 ; GISEL-NEXT: v_mov_b32_e32 v36, v10 ; GISEL-NEXT: v_mov_b32_e32 v37, v11 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27] +; GISEL-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-NEXT: v_mov_b32_e32 v1, v13 +; GISEL-NEXT: v_mov_b32_e32 v2, v14 +; GISEL-NEXT: v_mov_b32_e32 v3, v15 +; GISEL-NEXT: v_mov_b32_e32 v4, v16 +; GISEL-NEXT: v_mov_b32_e32 v5, v17 +; GISEL-NEXT: v_mov_b32_e32 v6, v18 +; GISEL-NEXT: v_mov_b32_e32 v7, v19 +; GISEL-NEXT: v_mov_b32_e32 v8, v20 +; GISEL-NEXT: v_mov_b32_e32 v9, v21 +; GISEL-NEXT: v_mov_b32_e32 v10, v22 +; GISEL-NEXT: v_mov_b32_e32 v11, v23 +; GISEL-NEXT: v_mov_b32_e32 v12, v24 +; GISEL-NEXT: v_mov_b32_e32 v13, v25 +; GISEL-NEXT: v_mov_b32_e32 v14, v26 +; GISEL-NEXT: v_mov_b32_e32 v15, v27 ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[48:51], v[30:37], v28 ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -2663,42 +2214,25 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags0(<4 x i32> %arg0, < ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags0: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3 +; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-NEXT: v_mov_b32_e32 v15, v27 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags0: @@ -2716,14 +2250,22 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags0(<4 x i32> %arg0, < ; GISEL-NEXT: v_mov_b32_e32 v35, v9 ; GISEL-NEXT: v_mov_b32_e32 v36, v10 ; GISEL-NEXT: v_mov_b32_e32 v37, v11 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27] +; GISEL-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-NEXT: v_mov_b32_e32 v1, v13 +; GISEL-NEXT: v_mov_b32_e32 v2, v14 +; GISEL-NEXT: v_mov_b32_e32 v3, v15 +; GISEL-NEXT: v_mov_b32_e32 v4, v16 +; GISEL-NEXT: v_mov_b32_e32 v5, v17 +; GISEL-NEXT: v_mov_b32_e32 v6, v18 +; GISEL-NEXT: v_mov_b32_e32 v7, v19 +; GISEL-NEXT: v_mov_b32_e32 v8, v20 +; GISEL-NEXT: v_mov_b32_e32 v9, v21 +; GISEL-NEXT: v_mov_b32_e32 v10, v22 +; GISEL-NEXT: v_mov_b32_e32 v11, v23 +; GISEL-NEXT: v_mov_b32_e32 v12, v24 +; GISEL-NEXT: v_mov_b32_e32 v13, v25 +; GISEL-NEXT: v_mov_b32_e32 v14, v26 +; GISEL-NEXT: v_mov_b32_e32 v15, v27 ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3 ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -2735,42 +2277,25 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags1(<4 x i32> %arg0, < ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags1: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1 +; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-NEXT: v_mov_b32_e32 v15, v27 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags1: @@ -2788,14 +2313,22 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags1(<4 x i32> %arg0, < ; GISEL-NEXT: v_mov_b32_e32 v35, v9 ; GISEL-NEXT: v_mov_b32_e32 v36, v10 ; GISEL-NEXT: v_mov_b32_e32 v37, v11 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27] +; GISEL-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-NEXT: v_mov_b32_e32 v1, v13 +; GISEL-NEXT: v_mov_b32_e32 v2, v14 +; GISEL-NEXT: v_mov_b32_e32 v3, v15 +; GISEL-NEXT: v_mov_b32_e32 v4, v16 +; GISEL-NEXT: v_mov_b32_e32 v5, v17 +; GISEL-NEXT: v_mov_b32_e32 v6, v18 +; GISEL-NEXT: v_mov_b32_e32 v7, v19 +; GISEL-NEXT: v_mov_b32_e32 v8, v20 +; GISEL-NEXT: v_mov_b32_e32 v9, v21 +; GISEL-NEXT: v_mov_b32_e32 v10, v22 +; GISEL-NEXT: v_mov_b32_e32 v11, v23 +; GISEL-NEXT: v_mov_b32_e32 v12, v24 +; GISEL-NEXT: v_mov_b32_e32 v13, v25 +; GISEL-NEXT: v_mov_b32_e32 v14, v26 +; GISEL-NEXT: v_mov_b32_e32 v15, v27 ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1 ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -2807,108 +2340,94 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__sgpr(<4 x i32> inreg %arg ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__sgpr: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v28, s0 -; SDAG-NEXT: v_mov_b32_e32 v29, s1 -; SDAG-NEXT: v_mov_b32_e32 v30, s2 -; SDAG-NEXT: v_mov_b32_e32 v31, s3 -; SDAG-NEXT: v_mov_b32_e32 v12, s24 -; SDAG-NEXT: v_mov_b32_e32 v27, v9 -; SDAG-NEXT: v_mov_b32_e32 v26, v8 -; SDAG-NEXT: v_mov_b32_e32 v25, v7 -; SDAG-NEXT: v_mov_b32_e32 v24, v6 -; SDAG-NEXT: v_mov_b32_e32 v23, v5 -; SDAG-NEXT: v_mov_b32_e32 v22, v4 -; SDAG-NEXT: v_mov_b32_e32 v21, v3 -; SDAG-NEXT: v_mov_b32_e32 v20, v2 -; SDAG-NEXT: v_mov_b32_e32 v19, v1 -; SDAG-NEXT: v_mov_b32_e32 v18, v0 +; SDAG-NEXT: v_mov_b32_e32 v36, s0 +; SDAG-NEXT: v_mov_b32_e32 v37, s1 +; SDAG-NEXT: v_mov_b32_e32 v38, s2 +; SDAG-NEXT: v_mov_b32_e32 v39, s3 ; SDAG-NEXT: v_mov_b32_e32 v13, s25 ; SDAG-NEXT: v_mov_b32_e32 v14, s26 ; SDAG-NEXT: v_mov_b32_e32 v15, s27 ; SDAG-NEXT: v_mov_b32_e32 v16, s28 ; SDAG-NEXT: v_mov_b32_e32 v17, s29 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_mov_b32_e32 v0, s16 -; SDAG-NEXT: v_mov_b32_e32 v1, s17 -; SDAG-NEXT: v_mov_b32_e32 v2, s18 -; SDAG-NEXT: v_mov_b32_e32 v3, s19 -; SDAG-NEXT: v_mov_b32_e32 v4, s20 -; SDAG-NEXT: v_mov_b32_e32 v5, s21 -; SDAG-NEXT: v_mov_b32_e32 v6, s22 -; SDAG-NEXT: v_mov_b32_e32 v7, s23 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: v_mov_b32_e32 v28, s16 +; SDAG-NEXT: v_mov_b32_e32 v29, s17 +; SDAG-NEXT: v_mov_b32_e32 v30, s18 +; SDAG-NEXT: v_mov_b32_e32 v31, s19 +; SDAG-NEXT: v_mov_b32_e32 v32, s20 +; SDAG-NEXT: v_mov_b32_e32 v33, s21 +; SDAG-NEXT: v_mov_b32_e32 v34, s22 +; SDAG-NEXT: v_mov_b32_e32 v35, s23 +; SDAG-NEXT: v_mov_b32_e32 v12, s24 +; SDAG-NEXT: v_mov_b32_e32 v18, v0 +; SDAG-NEXT: v_mov_b32_e32 v19, v1 +; SDAG-NEXT: v_mov_b32_e32 v20, v2 +; SDAG-NEXT: v_mov_b32_e32 v21, v3 +; SDAG-NEXT: v_mov_b32_e32 v22, v4 +; SDAG-NEXT: v_mov_b32_e32 v23, v5 +; SDAG-NEXT: v_mov_b32_e32 v24, v6 +; SDAG-NEXT: v_mov_b32_e32 v25, v7 +; SDAG-NEXT: v_mov_b32_e32 v26, v8 +; SDAG-NEXT: v_mov_b32_e32 v27, v9 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 a[0:15], v[28:31], v[0:7], v10 +; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[36:39], v[28:35], v10 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-NEXT: v_mov_b32_e32 v15, v27 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__sgpr: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] -; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] -; GISEL-NEXT: v_mov_b32_e32 v18, s24 -; GISEL-NEXT: v_mov_b32_e32 v19, s25 -; GISEL-NEXT: v_mov_b32_e32 v24, v0 -; GISEL-NEXT: v_mov_b32_e32 v25, v1 -; GISEL-NEXT: v_mov_b32_e32 v26, v2 -; GISEL-NEXT: v_mov_b32_e32 v27, v3 -; GISEL-NEXT: v_mov_b32_e32 v28, v4 -; GISEL-NEXT: v_mov_b32_e32 v29, v5 -; GISEL-NEXT: v_mov_b32_e32 v30, v6 -; GISEL-NEXT: v_mov_b32_e32 v31, v7 -; GISEL-NEXT: v_mov_b32_e32 v32, v8 -; GISEL-NEXT: v_mov_b32_e32 v33, v9 -; GISEL-NEXT: v_mov_b32_e32 v16, v10 -; GISEL-NEXT: v_mov_b32_e32 v20, s26 -; GISEL-NEXT: v_mov_b32_e32 v21, s27 -; GISEL-NEXT: v_mov_b32_e32 v22, s28 -; GISEL-NEXT: v_mov_b32_e32 v23, s29 -; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[28:29] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[30:31] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[32:33] +; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[0:1] +; GISEL-NEXT: v_mov_b32_e32 v11, v0 +; GISEL-NEXT: v_mov_b32_e32 v12, v1 +; GISEL-NEXT: v_mov_b32_e32 v13, v2 +; GISEL-NEXT: v_mov_b32_e32 v14, v3 +; GISEL-NEXT: v_mov_b32_e32 v15, v4 +; GISEL-NEXT: v_mov_b32_e32 v16, v5 +; GISEL-NEXT: v_mov_b32_e32 v17, v6 +; GISEL-NEXT: v_mov_b32_e32 v18, v7 +; GISEL-NEXT: v_mov_b32_e32 v19, v8 +; GISEL-NEXT: v_mov_b32_e32 v20, v9 +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[22:23] +; GISEL-NEXT: v_mov_b32_e32 v21, v10 +; GISEL-NEXT: v_mov_b32_e32 v0, s24 +; GISEL-NEXT: v_mov_b32_e32 v1, s25 +; GISEL-NEXT: v_mov_b32_e32 v2, s26 +; GISEL-NEXT: v_mov_b32_e32 v3, s27 +; GISEL-NEXT: v_mov_b32_e32 v4, s28 +; GISEL-NEXT: v_mov_b32_e32 v5, s29 +; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17] +; GISEL-NEXT: v_mov_b32_e32 v6, v11 +; GISEL-NEXT: v_mov_b32_e32 v7, v12 +; GISEL-NEXT: v_mov_b32_e32 v8, v13 +; GISEL-NEXT: v_mov_b32_e32 v9, v14 +; GISEL-NEXT: v_mov_b32_e32 v10, v15 +; GISEL-NEXT: v_mov_b32_e32 v11, v16 +; GISEL-NEXT: v_mov_b32_e32 v12, v17 +; GISEL-NEXT: v_mov_b32_e32 v13, v18 +; GISEL-NEXT: v_mov_b32_e32 v14, v19 +; GISEL-NEXT: v_mov_b32_e32 v15, v20 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[34:37], v[48:55], v16 +; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[30:33], v[22:29], v21 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <16 x float> %result @@ -3005,42 +2524,25 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8(<4 x i32> %arg0, <8 x i32> ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 a[0:15], v[0:3], v[4:11], v28 +; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-NEXT: v_mov_b32_e32 v15, v27 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8: @@ -3058,14 +2560,22 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8(<4 x i32> %arg0, <8 x i32> ; GISEL-NEXT: v_mov_b32_e32 v35, v9 ; GISEL-NEXT: v_mov_b32_e32 v36, v10 ; GISEL-NEXT: v_mov_b32_e32 v37, v11 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27] +; GISEL-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-NEXT: v_mov_b32_e32 v1, v13 +; GISEL-NEXT: v_mov_b32_e32 v2, v14 +; GISEL-NEXT: v_mov_b32_e32 v3, v15 +; GISEL-NEXT: v_mov_b32_e32 v4, v16 +; GISEL-NEXT: v_mov_b32_e32 v5, v17 +; GISEL-NEXT: v_mov_b32_e32 v6, v18 +; GISEL-NEXT: v_mov_b32_e32 v7, v19 +; GISEL-NEXT: v_mov_b32_e32 v8, v20 +; GISEL-NEXT: v_mov_b32_e32 v9, v21 +; GISEL-NEXT: v_mov_b32_e32 v10, v22 +; GISEL-NEXT: v_mov_b32_e32 v11, v23 +; GISEL-NEXT: v_mov_b32_e32 v12, v24 +; GISEL-NEXT: v_mov_b32_e32 v13, v25 +; GISEL-NEXT: v_mov_b32_e32 v14, v26 +; GISEL-NEXT: v_mov_b32_e32 v15, v27 ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[48:51], v[30:37], v28 ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -3077,42 +2587,25 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags0(<4 x i32> %arg0, < ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags0: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3 +; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-NEXT: v_mov_b32_e32 v15, v27 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags0: @@ -3130,14 +2623,22 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags0(<4 x i32> %arg0, < ; GISEL-NEXT: v_mov_b32_e32 v35, v9 ; GISEL-NEXT: v_mov_b32_e32 v36, v10 ; GISEL-NEXT: v_mov_b32_e32 v37, v11 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27] +; GISEL-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-NEXT: v_mov_b32_e32 v1, v13 +; GISEL-NEXT: v_mov_b32_e32 v2, v14 +; GISEL-NEXT: v_mov_b32_e32 v3, v15 +; GISEL-NEXT: v_mov_b32_e32 v4, v16 +; GISEL-NEXT: v_mov_b32_e32 v5, v17 +; GISEL-NEXT: v_mov_b32_e32 v6, v18 +; GISEL-NEXT: v_mov_b32_e32 v7, v19 +; GISEL-NEXT: v_mov_b32_e32 v8, v20 +; GISEL-NEXT: v_mov_b32_e32 v9, v21 +; GISEL-NEXT: v_mov_b32_e32 v10, v22 +; GISEL-NEXT: v_mov_b32_e32 v11, v23 +; GISEL-NEXT: v_mov_b32_e32 v12, v24 +; GISEL-NEXT: v_mov_b32_e32 v13, v25 +; GISEL-NEXT: v_mov_b32_e32 v14, v26 +; GISEL-NEXT: v_mov_b32_e32 v15, v27 ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3 ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -3149,42 +2650,25 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags1(<4 x i32> %arg0, < ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags1: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1 +; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-NEXT: v_mov_b32_e32 v15, v27 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags1: @@ -3202,14 +2686,22 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags1(<4 x i32> %arg0, < ; GISEL-NEXT: v_mov_b32_e32 v35, v9 ; GISEL-NEXT: v_mov_b32_e32 v36, v10 ; GISEL-NEXT: v_mov_b32_e32 v37, v11 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27] +; GISEL-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-NEXT: v_mov_b32_e32 v1, v13 +; GISEL-NEXT: v_mov_b32_e32 v2, v14 +; GISEL-NEXT: v_mov_b32_e32 v3, v15 +; GISEL-NEXT: v_mov_b32_e32 v4, v16 +; GISEL-NEXT: v_mov_b32_e32 v5, v17 +; GISEL-NEXT: v_mov_b32_e32 v6, v18 +; GISEL-NEXT: v_mov_b32_e32 v7, v19 +; GISEL-NEXT: v_mov_b32_e32 v8, v20 +; GISEL-NEXT: v_mov_b32_e32 v9, v21 +; GISEL-NEXT: v_mov_b32_e32 v10, v22 +; GISEL-NEXT: v_mov_b32_e32 v11, v23 +; GISEL-NEXT: v_mov_b32_e32 v12, v24 +; GISEL-NEXT: v_mov_b32_e32 v13, v25 +; GISEL-NEXT: v_mov_b32_e32 v14, v26 +; GISEL-NEXT: v_mov_b32_e32 v15, v27 ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1 ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -3221,108 +2713,94 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__sgpr(<4 x i32> inreg %arg ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__sgpr: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v28, s0 -; SDAG-NEXT: v_mov_b32_e32 v29, s1 -; SDAG-NEXT: v_mov_b32_e32 v30, s2 -; SDAG-NEXT: v_mov_b32_e32 v31, s3 -; SDAG-NEXT: v_mov_b32_e32 v12, s24 -; SDAG-NEXT: v_mov_b32_e32 v27, v9 -; SDAG-NEXT: v_mov_b32_e32 v26, v8 -; SDAG-NEXT: v_mov_b32_e32 v25, v7 -; SDAG-NEXT: v_mov_b32_e32 v24, v6 -; SDAG-NEXT: v_mov_b32_e32 v23, v5 -; SDAG-NEXT: v_mov_b32_e32 v22, v4 -; SDAG-NEXT: v_mov_b32_e32 v21, v3 -; SDAG-NEXT: v_mov_b32_e32 v20, v2 -; SDAG-NEXT: v_mov_b32_e32 v19, v1 -; SDAG-NEXT: v_mov_b32_e32 v18, v0 +; SDAG-NEXT: v_mov_b32_e32 v36, s0 +; SDAG-NEXT: v_mov_b32_e32 v37, s1 +; SDAG-NEXT: v_mov_b32_e32 v38, s2 +; SDAG-NEXT: v_mov_b32_e32 v39, s3 ; SDAG-NEXT: v_mov_b32_e32 v13, s25 ; SDAG-NEXT: v_mov_b32_e32 v14, s26 ; SDAG-NEXT: v_mov_b32_e32 v15, s27 ; SDAG-NEXT: v_mov_b32_e32 v16, s28 ; SDAG-NEXT: v_mov_b32_e32 v17, s29 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_mov_b32_e32 v0, s16 -; SDAG-NEXT: v_mov_b32_e32 v1, s17 -; SDAG-NEXT: v_mov_b32_e32 v2, s18 -; SDAG-NEXT: v_mov_b32_e32 v3, s19 -; SDAG-NEXT: v_mov_b32_e32 v4, s20 -; SDAG-NEXT: v_mov_b32_e32 v5, s21 -; SDAG-NEXT: v_mov_b32_e32 v6, s22 -; SDAG-NEXT: v_mov_b32_e32 v7, s23 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: v_mov_b32_e32 v28, s16 +; SDAG-NEXT: v_mov_b32_e32 v29, s17 +; SDAG-NEXT: v_mov_b32_e32 v30, s18 +; SDAG-NEXT: v_mov_b32_e32 v31, s19 +; SDAG-NEXT: v_mov_b32_e32 v32, s20 +; SDAG-NEXT: v_mov_b32_e32 v33, s21 +; SDAG-NEXT: v_mov_b32_e32 v34, s22 +; SDAG-NEXT: v_mov_b32_e32 v35, s23 +; SDAG-NEXT: v_mov_b32_e32 v12, s24 +; SDAG-NEXT: v_mov_b32_e32 v18, v0 +; SDAG-NEXT: v_mov_b32_e32 v19, v1 +; SDAG-NEXT: v_mov_b32_e32 v20, v2 +; SDAG-NEXT: v_mov_b32_e32 v21, v3 +; SDAG-NEXT: v_mov_b32_e32 v22, v4 +; SDAG-NEXT: v_mov_b32_e32 v23, v5 +; SDAG-NEXT: v_mov_b32_e32 v24, v6 +; SDAG-NEXT: v_mov_b32_e32 v25, v7 +; SDAG-NEXT: v_mov_b32_e32 v26, v8 +; SDAG-NEXT: v_mov_b32_e32 v27, v9 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 a[0:15], v[28:31], v[0:7], v10 +; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[36:39], v[28:35], v10 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-NEXT: v_mov_b32_e32 v15, v27 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__sgpr: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] -; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] -; GISEL-NEXT: v_mov_b32_e32 v18, s24 -; GISEL-NEXT: v_mov_b32_e32 v19, s25 -; GISEL-NEXT: v_mov_b32_e32 v24, v0 -; GISEL-NEXT: v_mov_b32_e32 v25, v1 -; GISEL-NEXT: v_mov_b32_e32 v26, v2 -; GISEL-NEXT: v_mov_b32_e32 v27, v3 -; GISEL-NEXT: v_mov_b32_e32 v28, v4 -; GISEL-NEXT: v_mov_b32_e32 v29, v5 -; GISEL-NEXT: v_mov_b32_e32 v30, v6 -; GISEL-NEXT: v_mov_b32_e32 v31, v7 -; GISEL-NEXT: v_mov_b32_e32 v32, v8 -; GISEL-NEXT: v_mov_b32_e32 v33, v9 -; GISEL-NEXT: v_mov_b32_e32 v16, v10 -; GISEL-NEXT: v_mov_b32_e32 v20, s26 -; GISEL-NEXT: v_mov_b32_e32 v21, s27 -; GISEL-NEXT: v_mov_b32_e32 v22, s28 -; GISEL-NEXT: v_mov_b32_e32 v23, s29 -; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[28:29] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[30:31] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[32:33] +; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[0:1] +; GISEL-NEXT: v_mov_b32_e32 v11, v0 +; GISEL-NEXT: v_mov_b32_e32 v12, v1 +; GISEL-NEXT: v_mov_b32_e32 v13, v2 +; GISEL-NEXT: v_mov_b32_e32 v14, v3 +; GISEL-NEXT: v_mov_b32_e32 v15, v4 +; GISEL-NEXT: v_mov_b32_e32 v16, v5 +; GISEL-NEXT: v_mov_b32_e32 v17, v6 +; GISEL-NEXT: v_mov_b32_e32 v18, v7 +; GISEL-NEXT: v_mov_b32_e32 v19, v8 +; GISEL-NEXT: v_mov_b32_e32 v20, v9 +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[22:23] +; GISEL-NEXT: v_mov_b32_e32 v21, v10 +; GISEL-NEXT: v_mov_b32_e32 v0, s24 +; GISEL-NEXT: v_mov_b32_e32 v1, s25 +; GISEL-NEXT: v_mov_b32_e32 v2, s26 +; GISEL-NEXT: v_mov_b32_e32 v3, s27 +; GISEL-NEXT: v_mov_b32_e32 v4, s28 +; GISEL-NEXT: v_mov_b32_e32 v5, s29 +; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17] +; GISEL-NEXT: v_mov_b32_e32 v6, v11 +; GISEL-NEXT: v_mov_b32_e32 v7, v12 +; GISEL-NEXT: v_mov_b32_e32 v8, v13 +; GISEL-NEXT: v_mov_b32_e32 v9, v14 +; GISEL-NEXT: v_mov_b32_e32 v10, v15 +; GISEL-NEXT: v_mov_b32_e32 v11, v16 +; GISEL-NEXT: v_mov_b32_e32 v12, v17 +; GISEL-NEXT: v_mov_b32_e32 v13, v18 +; GISEL-NEXT: v_mov_b32_e32 v14, v19 +; GISEL-NEXT: v_mov_b32_e32 v15, v20 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[34:37], v[48:55], v16 +; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[30:33], v[22:29], v21 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <16 x float> %result @@ -3419,42 +2897,25 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8(<4 x i32> %arg0, <8 x i32> ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 a[0:15], v[0:3], v[4:11], v28 +; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-NEXT: v_mov_b32_e32 v15, v27 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8: @@ -3472,14 +2933,22 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8(<4 x i32> %arg0, <8 x i32> ; GISEL-NEXT: v_mov_b32_e32 v35, v9 ; GISEL-NEXT: v_mov_b32_e32 v36, v10 ; GISEL-NEXT: v_mov_b32_e32 v37, v11 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27] +; GISEL-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-NEXT: v_mov_b32_e32 v1, v13 +; GISEL-NEXT: v_mov_b32_e32 v2, v14 +; GISEL-NEXT: v_mov_b32_e32 v3, v15 +; GISEL-NEXT: v_mov_b32_e32 v4, v16 +; GISEL-NEXT: v_mov_b32_e32 v5, v17 +; GISEL-NEXT: v_mov_b32_e32 v6, v18 +; GISEL-NEXT: v_mov_b32_e32 v7, v19 +; GISEL-NEXT: v_mov_b32_e32 v8, v20 +; GISEL-NEXT: v_mov_b32_e32 v9, v21 +; GISEL-NEXT: v_mov_b32_e32 v10, v22 +; GISEL-NEXT: v_mov_b32_e32 v11, v23 +; GISEL-NEXT: v_mov_b32_e32 v12, v24 +; GISEL-NEXT: v_mov_b32_e32 v13, v25 +; GISEL-NEXT: v_mov_b32_e32 v14, v26 +; GISEL-NEXT: v_mov_b32_e32 v15, v27 ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[48:51], v[30:37], v28 ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -3491,42 +2960,25 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags0(<4 x i32> %arg0, < ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags0: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3 +; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-NEXT: v_mov_b32_e32 v15, v27 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags0: @@ -3544,14 +2996,22 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags0(<4 x i32> %arg0, < ; GISEL-NEXT: v_mov_b32_e32 v35, v9 ; GISEL-NEXT: v_mov_b32_e32 v36, v10 ; GISEL-NEXT: v_mov_b32_e32 v37, v11 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27] +; GISEL-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-NEXT: v_mov_b32_e32 v1, v13 +; GISEL-NEXT: v_mov_b32_e32 v2, v14 +; GISEL-NEXT: v_mov_b32_e32 v3, v15 +; GISEL-NEXT: v_mov_b32_e32 v4, v16 +; GISEL-NEXT: v_mov_b32_e32 v5, v17 +; GISEL-NEXT: v_mov_b32_e32 v6, v18 +; GISEL-NEXT: v_mov_b32_e32 v7, v19 +; GISEL-NEXT: v_mov_b32_e32 v8, v20 +; GISEL-NEXT: v_mov_b32_e32 v9, v21 +; GISEL-NEXT: v_mov_b32_e32 v10, v22 +; GISEL-NEXT: v_mov_b32_e32 v11, v23 +; GISEL-NEXT: v_mov_b32_e32 v12, v24 +; GISEL-NEXT: v_mov_b32_e32 v13, v25 +; GISEL-NEXT: v_mov_b32_e32 v14, v26 +; GISEL-NEXT: v_mov_b32_e32 v15, v27 ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3 ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -3563,42 +3023,25 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags1(<4 x i32> %arg0, < ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags1: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1 +; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-NEXT: v_mov_b32_e32 v15, v27 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags1: @@ -3616,14 +3059,22 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags1(<4 x i32> %arg0, < ; GISEL-NEXT: v_mov_b32_e32 v35, v9 ; GISEL-NEXT: v_mov_b32_e32 v36, v10 ; GISEL-NEXT: v_mov_b32_e32 v37, v11 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27] +; GISEL-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-NEXT: v_mov_b32_e32 v1, v13 +; GISEL-NEXT: v_mov_b32_e32 v2, v14 +; GISEL-NEXT: v_mov_b32_e32 v3, v15 +; GISEL-NEXT: v_mov_b32_e32 v4, v16 +; GISEL-NEXT: v_mov_b32_e32 v5, v17 +; GISEL-NEXT: v_mov_b32_e32 v6, v18 +; GISEL-NEXT: v_mov_b32_e32 v7, v19 +; GISEL-NEXT: v_mov_b32_e32 v8, v20 +; GISEL-NEXT: v_mov_b32_e32 v9, v21 +; GISEL-NEXT: v_mov_b32_e32 v10, v22 +; GISEL-NEXT: v_mov_b32_e32 v11, v23 +; GISEL-NEXT: v_mov_b32_e32 v12, v24 +; GISEL-NEXT: v_mov_b32_e32 v13, v25 +; GISEL-NEXT: v_mov_b32_e32 v14, v26 +; GISEL-NEXT: v_mov_b32_e32 v15, v27 ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1 ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -3635,108 +3086,94 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__sgpr(<4 x i32> inreg %arg ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__sgpr: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v28, s0 -; SDAG-NEXT: v_mov_b32_e32 v29, s1 -; SDAG-NEXT: v_mov_b32_e32 v30, s2 -; SDAG-NEXT: v_mov_b32_e32 v31, s3 -; SDAG-NEXT: v_mov_b32_e32 v12, s24 -; SDAG-NEXT: v_mov_b32_e32 v27, v9 -; SDAG-NEXT: v_mov_b32_e32 v26, v8 -; SDAG-NEXT: v_mov_b32_e32 v25, v7 -; SDAG-NEXT: v_mov_b32_e32 v24, v6 -; SDAG-NEXT: v_mov_b32_e32 v23, v5 -; SDAG-NEXT: v_mov_b32_e32 v22, v4 -; SDAG-NEXT: v_mov_b32_e32 v21, v3 -; SDAG-NEXT: v_mov_b32_e32 v20, v2 -; SDAG-NEXT: v_mov_b32_e32 v19, v1 -; SDAG-NEXT: v_mov_b32_e32 v18, v0 +; SDAG-NEXT: v_mov_b32_e32 v36, s0 +; SDAG-NEXT: v_mov_b32_e32 v37, s1 +; SDAG-NEXT: v_mov_b32_e32 v38, s2 +; SDAG-NEXT: v_mov_b32_e32 v39, s3 ; SDAG-NEXT: v_mov_b32_e32 v13, s25 ; SDAG-NEXT: v_mov_b32_e32 v14, s26 ; SDAG-NEXT: v_mov_b32_e32 v15, s27 ; SDAG-NEXT: v_mov_b32_e32 v16, s28 ; SDAG-NEXT: v_mov_b32_e32 v17, s29 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_mov_b32_e32 v0, s16 -; SDAG-NEXT: v_mov_b32_e32 v1, s17 -; SDAG-NEXT: v_mov_b32_e32 v2, s18 -; SDAG-NEXT: v_mov_b32_e32 v3, s19 -; SDAG-NEXT: v_mov_b32_e32 v4, s20 -; SDAG-NEXT: v_mov_b32_e32 v5, s21 -; SDAG-NEXT: v_mov_b32_e32 v6, s22 -; SDAG-NEXT: v_mov_b32_e32 v7, s23 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: v_mov_b32_e32 v28, s16 +; SDAG-NEXT: v_mov_b32_e32 v29, s17 +; SDAG-NEXT: v_mov_b32_e32 v30, s18 +; SDAG-NEXT: v_mov_b32_e32 v31, s19 +; SDAG-NEXT: v_mov_b32_e32 v32, s20 +; SDAG-NEXT: v_mov_b32_e32 v33, s21 +; SDAG-NEXT: v_mov_b32_e32 v34, s22 +; SDAG-NEXT: v_mov_b32_e32 v35, s23 +; SDAG-NEXT: v_mov_b32_e32 v12, s24 +; SDAG-NEXT: v_mov_b32_e32 v18, v0 +; SDAG-NEXT: v_mov_b32_e32 v19, v1 +; SDAG-NEXT: v_mov_b32_e32 v20, v2 +; SDAG-NEXT: v_mov_b32_e32 v21, v3 +; SDAG-NEXT: v_mov_b32_e32 v22, v4 +; SDAG-NEXT: v_mov_b32_e32 v23, v5 +; SDAG-NEXT: v_mov_b32_e32 v24, v6 +; SDAG-NEXT: v_mov_b32_e32 v25, v7 +; SDAG-NEXT: v_mov_b32_e32 v26, v8 +; SDAG-NEXT: v_mov_b32_e32 v27, v9 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 a[0:15], v[28:31], v[0:7], v10 +; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[36:39], v[28:35], v10 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-NEXT: v_mov_b32_e32 v15, v27 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__sgpr: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] -; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] -; GISEL-NEXT: v_mov_b32_e32 v18, s24 -; GISEL-NEXT: v_mov_b32_e32 v19, s25 -; GISEL-NEXT: v_mov_b32_e32 v24, v0 -; GISEL-NEXT: v_mov_b32_e32 v25, v1 -; GISEL-NEXT: v_mov_b32_e32 v26, v2 -; GISEL-NEXT: v_mov_b32_e32 v27, v3 -; GISEL-NEXT: v_mov_b32_e32 v28, v4 -; GISEL-NEXT: v_mov_b32_e32 v29, v5 -; GISEL-NEXT: v_mov_b32_e32 v30, v6 -; GISEL-NEXT: v_mov_b32_e32 v31, v7 -; GISEL-NEXT: v_mov_b32_e32 v32, v8 -; GISEL-NEXT: v_mov_b32_e32 v33, v9 -; GISEL-NEXT: v_mov_b32_e32 v16, v10 -; GISEL-NEXT: v_mov_b32_e32 v20, s26 -; GISEL-NEXT: v_mov_b32_e32 v21, s27 -; GISEL-NEXT: v_mov_b32_e32 v22, s28 -; GISEL-NEXT: v_mov_b32_e32 v23, s29 -; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[28:29] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[30:31] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[32:33] +; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[0:1] +; GISEL-NEXT: v_mov_b32_e32 v11, v0 +; GISEL-NEXT: v_mov_b32_e32 v12, v1 +; GISEL-NEXT: v_mov_b32_e32 v13, v2 +; GISEL-NEXT: v_mov_b32_e32 v14, v3 +; GISEL-NEXT: v_mov_b32_e32 v15, v4 +; GISEL-NEXT: v_mov_b32_e32 v16, v5 +; GISEL-NEXT: v_mov_b32_e32 v17, v6 +; GISEL-NEXT: v_mov_b32_e32 v18, v7 +; GISEL-NEXT: v_mov_b32_e32 v19, v8 +; GISEL-NEXT: v_mov_b32_e32 v20, v9 +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[22:23] +; GISEL-NEXT: v_mov_b32_e32 v21, v10 +; GISEL-NEXT: v_mov_b32_e32 v0, s24 +; GISEL-NEXT: v_mov_b32_e32 v1, s25 +; GISEL-NEXT: v_mov_b32_e32 v2, s26 +; GISEL-NEXT: v_mov_b32_e32 v3, s27 +; GISEL-NEXT: v_mov_b32_e32 v4, s28 +; GISEL-NEXT: v_mov_b32_e32 v5, s29 +; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17] +; GISEL-NEXT: v_mov_b32_e32 v6, v11 +; GISEL-NEXT: v_mov_b32_e32 v7, v12 +; GISEL-NEXT: v_mov_b32_e32 v8, v13 +; GISEL-NEXT: v_mov_b32_e32 v9, v14 +; GISEL-NEXT: v_mov_b32_e32 v10, v15 +; GISEL-NEXT: v_mov_b32_e32 v11, v16 +; GISEL-NEXT: v_mov_b32_e32 v12, v17 +; GISEL-NEXT: v_mov_b32_e32 v13, v18 +; GISEL-NEXT: v_mov_b32_e32 v14, v19 +; GISEL-NEXT: v_mov_b32_e32 v15, v20 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[34:37], v[48:55], v16 +; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[30:33], v[22:29], v21 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <16 x float> %result @@ -3833,42 +3270,25 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8(<4 x i32> %arg0, <8 x i32> ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 a[0:15], v[0:3], v[4:11], v28 +; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-NEXT: v_mov_b32_e32 v15, v27 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8: @@ -3886,14 +3306,22 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8(<4 x i32> %arg0, <8 x i32> ; GISEL-NEXT: v_mov_b32_e32 v35, v9 ; GISEL-NEXT: v_mov_b32_e32 v36, v10 ; GISEL-NEXT: v_mov_b32_e32 v37, v11 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27] +; GISEL-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-NEXT: v_mov_b32_e32 v1, v13 +; GISEL-NEXT: v_mov_b32_e32 v2, v14 +; GISEL-NEXT: v_mov_b32_e32 v3, v15 +; GISEL-NEXT: v_mov_b32_e32 v4, v16 +; GISEL-NEXT: v_mov_b32_e32 v5, v17 +; GISEL-NEXT: v_mov_b32_e32 v6, v18 +; GISEL-NEXT: v_mov_b32_e32 v7, v19 +; GISEL-NEXT: v_mov_b32_e32 v8, v20 +; GISEL-NEXT: v_mov_b32_e32 v9, v21 +; GISEL-NEXT: v_mov_b32_e32 v10, v22 +; GISEL-NEXT: v_mov_b32_e32 v11, v23 +; GISEL-NEXT: v_mov_b32_e32 v12, v24 +; GISEL-NEXT: v_mov_b32_e32 v13, v25 +; GISEL-NEXT: v_mov_b32_e32 v14, v26 +; GISEL-NEXT: v_mov_b32_e32 v15, v27 ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[48:51], v[30:37], v28 ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -3905,42 +3333,25 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags0(<4 x i32> %arg0, < ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags0: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3 +; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-NEXT: v_mov_b32_e32 v15, v27 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags0: @@ -3958,14 +3369,22 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags0(<4 x i32> %arg0, < ; GISEL-NEXT: v_mov_b32_e32 v35, v9 ; GISEL-NEXT: v_mov_b32_e32 v36, v10 ; GISEL-NEXT: v_mov_b32_e32 v37, v11 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27] +; GISEL-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-NEXT: v_mov_b32_e32 v1, v13 +; GISEL-NEXT: v_mov_b32_e32 v2, v14 +; GISEL-NEXT: v_mov_b32_e32 v3, v15 +; GISEL-NEXT: v_mov_b32_e32 v4, v16 +; GISEL-NEXT: v_mov_b32_e32 v5, v17 +; GISEL-NEXT: v_mov_b32_e32 v6, v18 +; GISEL-NEXT: v_mov_b32_e32 v7, v19 +; GISEL-NEXT: v_mov_b32_e32 v8, v20 +; GISEL-NEXT: v_mov_b32_e32 v9, v21 +; GISEL-NEXT: v_mov_b32_e32 v10, v22 +; GISEL-NEXT: v_mov_b32_e32 v11, v23 +; GISEL-NEXT: v_mov_b32_e32 v12, v24 +; GISEL-NEXT: v_mov_b32_e32 v13, v25 +; GISEL-NEXT: v_mov_b32_e32 v14, v26 +; GISEL-NEXT: v_mov_b32_e32 v15, v27 ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3 ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -3977,42 +3396,25 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags1(<4 x i32> %arg0, < ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags1: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1 +; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-NEXT: v_mov_b32_e32 v15, v27 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags1: @@ -4030,14 +3432,22 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags1(<4 x i32> %arg0, < ; GISEL-NEXT: v_mov_b32_e32 v35, v9 ; GISEL-NEXT: v_mov_b32_e32 v36, v10 ; GISEL-NEXT: v_mov_b32_e32 v37, v11 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27] +; GISEL-NEXT: v_mov_b32_e32 v0, v12 +; GISEL-NEXT: v_mov_b32_e32 v1, v13 +; GISEL-NEXT: v_mov_b32_e32 v2, v14 +; GISEL-NEXT: v_mov_b32_e32 v3, v15 +; GISEL-NEXT: v_mov_b32_e32 v4, v16 +; GISEL-NEXT: v_mov_b32_e32 v5, v17 +; GISEL-NEXT: v_mov_b32_e32 v6, v18 +; GISEL-NEXT: v_mov_b32_e32 v7, v19 +; GISEL-NEXT: v_mov_b32_e32 v8, v20 +; GISEL-NEXT: v_mov_b32_e32 v9, v21 +; GISEL-NEXT: v_mov_b32_e32 v10, v22 +; GISEL-NEXT: v_mov_b32_e32 v11, v23 +; GISEL-NEXT: v_mov_b32_e32 v12, v24 +; GISEL-NEXT: v_mov_b32_e32 v13, v25 +; GISEL-NEXT: v_mov_b32_e32 v14, v26 +; GISEL-NEXT: v_mov_b32_e32 v15, v27 ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1 ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -4049,108 +3459,94 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__sgpr(<4 x i32> inreg %arg ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__sgpr: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v28, s0 -; SDAG-NEXT: v_mov_b32_e32 v29, s1 -; SDAG-NEXT: v_mov_b32_e32 v30, s2 -; SDAG-NEXT: v_mov_b32_e32 v31, s3 -; SDAG-NEXT: v_mov_b32_e32 v12, s24 -; SDAG-NEXT: v_mov_b32_e32 v27, v9 -; SDAG-NEXT: v_mov_b32_e32 v26, v8 -; SDAG-NEXT: v_mov_b32_e32 v25, v7 -; SDAG-NEXT: v_mov_b32_e32 v24, v6 -; SDAG-NEXT: v_mov_b32_e32 v23, v5 -; SDAG-NEXT: v_mov_b32_e32 v22, v4 -; SDAG-NEXT: v_mov_b32_e32 v21, v3 -; SDAG-NEXT: v_mov_b32_e32 v20, v2 -; SDAG-NEXT: v_mov_b32_e32 v19, v1 -; SDAG-NEXT: v_mov_b32_e32 v18, v0 +; SDAG-NEXT: v_mov_b32_e32 v36, s0 +; SDAG-NEXT: v_mov_b32_e32 v37, s1 +; SDAG-NEXT: v_mov_b32_e32 v38, s2 +; SDAG-NEXT: v_mov_b32_e32 v39, s3 ; SDAG-NEXT: v_mov_b32_e32 v13, s25 ; SDAG-NEXT: v_mov_b32_e32 v14, s26 ; SDAG-NEXT: v_mov_b32_e32 v15, s27 ; SDAG-NEXT: v_mov_b32_e32 v16, s28 ; SDAG-NEXT: v_mov_b32_e32 v17, s29 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_mov_b32_e32 v0, s16 -; SDAG-NEXT: v_mov_b32_e32 v1, s17 -; SDAG-NEXT: v_mov_b32_e32 v2, s18 -; SDAG-NEXT: v_mov_b32_e32 v3, s19 -; SDAG-NEXT: v_mov_b32_e32 v4, s20 -; SDAG-NEXT: v_mov_b32_e32 v5, s21 -; SDAG-NEXT: v_mov_b32_e32 v6, s22 -; SDAG-NEXT: v_mov_b32_e32 v7, s23 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 +; SDAG-NEXT: v_mov_b32_e32 v28, s16 +; SDAG-NEXT: v_mov_b32_e32 v29, s17 +; SDAG-NEXT: v_mov_b32_e32 v30, s18 +; SDAG-NEXT: v_mov_b32_e32 v31, s19 +; SDAG-NEXT: v_mov_b32_e32 v32, s20 +; SDAG-NEXT: v_mov_b32_e32 v33, s21 +; SDAG-NEXT: v_mov_b32_e32 v34, s22 +; SDAG-NEXT: v_mov_b32_e32 v35, s23 +; SDAG-NEXT: v_mov_b32_e32 v12, s24 +; SDAG-NEXT: v_mov_b32_e32 v18, v0 +; SDAG-NEXT: v_mov_b32_e32 v19, v1 +; SDAG-NEXT: v_mov_b32_e32 v20, v2 +; SDAG-NEXT: v_mov_b32_e32 v21, v3 +; SDAG-NEXT: v_mov_b32_e32 v22, v4 +; SDAG-NEXT: v_mov_b32_e32 v23, v5 +; SDAG-NEXT: v_mov_b32_e32 v24, v6 +; SDAG-NEXT: v_mov_b32_e32 v25, v7 +; SDAG-NEXT: v_mov_b32_e32 v26, v8 +; SDAG-NEXT: v_mov_b32_e32 v27, v9 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 a[0:15], v[28:31], v[0:7], v10 +; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[36:39], v[28:35], v10 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 +; SDAG-NEXT: v_mov_b32_e32 v0, v12 +; SDAG-NEXT: v_mov_b32_e32 v1, v13 +; SDAG-NEXT: v_mov_b32_e32 v2, v14 +; SDAG-NEXT: v_mov_b32_e32 v3, v15 +; SDAG-NEXT: v_mov_b32_e32 v4, v16 +; SDAG-NEXT: v_mov_b32_e32 v5, v17 +; SDAG-NEXT: v_mov_b32_e32 v6, v18 +; SDAG-NEXT: v_mov_b32_e32 v7, v19 +; SDAG-NEXT: v_mov_b32_e32 v8, v20 +; SDAG-NEXT: v_mov_b32_e32 v9, v21 +; SDAG-NEXT: v_mov_b32_e32 v10, v22 +; SDAG-NEXT: v_mov_b32_e32 v11, v23 +; SDAG-NEXT: v_mov_b32_e32 v12, v24 +; SDAG-NEXT: v_mov_b32_e32 v13, v25 +; SDAG-NEXT: v_mov_b32_e32 v14, v26 +; SDAG-NEXT: v_mov_b32_e32 v15, v27 ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__sgpr: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] -; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] -; GISEL-NEXT: v_mov_b32_e32 v18, s24 -; GISEL-NEXT: v_mov_b32_e32 v19, s25 -; GISEL-NEXT: v_mov_b32_e32 v24, v0 -; GISEL-NEXT: v_mov_b32_e32 v25, v1 -; GISEL-NEXT: v_mov_b32_e32 v26, v2 -; GISEL-NEXT: v_mov_b32_e32 v27, v3 -; GISEL-NEXT: v_mov_b32_e32 v28, v4 -; GISEL-NEXT: v_mov_b32_e32 v29, v5 -; GISEL-NEXT: v_mov_b32_e32 v30, v6 -; GISEL-NEXT: v_mov_b32_e32 v31, v7 -; GISEL-NEXT: v_mov_b32_e32 v32, v8 -; GISEL-NEXT: v_mov_b32_e32 v33, v9 -; GISEL-NEXT: v_mov_b32_e32 v16, v10 -; GISEL-NEXT: v_mov_b32_e32 v20, s26 -; GISEL-NEXT: v_mov_b32_e32 v21, s27 -; GISEL-NEXT: v_mov_b32_e32 v22, s28 -; GISEL-NEXT: v_mov_b32_e32 v23, s29 -; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[28:29] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[30:31] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[32:33] +; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[0:1] +; GISEL-NEXT: v_mov_b32_e32 v11, v0 +; GISEL-NEXT: v_mov_b32_e32 v12, v1 +; GISEL-NEXT: v_mov_b32_e32 v13, v2 +; GISEL-NEXT: v_mov_b32_e32 v14, v3 +; GISEL-NEXT: v_mov_b32_e32 v15, v4 +; GISEL-NEXT: v_mov_b32_e32 v16, v5 +; GISEL-NEXT: v_mov_b32_e32 v17, v6 +; GISEL-NEXT: v_mov_b32_e32 v18, v7 +; GISEL-NEXT: v_mov_b32_e32 v19, v8 +; GISEL-NEXT: v_mov_b32_e32 v20, v9 +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[22:23] +; GISEL-NEXT: v_mov_b32_e32 v21, v10 +; GISEL-NEXT: v_mov_b32_e32 v0, s24 +; GISEL-NEXT: v_mov_b32_e32 v1, s25 +; GISEL-NEXT: v_mov_b32_e32 v2, s26 +; GISEL-NEXT: v_mov_b32_e32 v3, s27 +; GISEL-NEXT: v_mov_b32_e32 v4, s28 +; GISEL-NEXT: v_mov_b32_e32 v5, s29 +; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17] +; GISEL-NEXT: v_mov_b32_e32 v6, v11 +; GISEL-NEXT: v_mov_b32_e32 v7, v12 +; GISEL-NEXT: v_mov_b32_e32 v8, v13 +; GISEL-NEXT: v_mov_b32_e32 v9, v14 +; GISEL-NEXT: v_mov_b32_e32 v10, v15 +; GISEL-NEXT: v_mov_b32_e32 v11, v16 +; GISEL-NEXT: v_mov_b32_e32 v12, v17 +; GISEL-NEXT: v_mov_b32_e32 v13, v18 +; GISEL-NEXT: v_mov_b32_e32 v14, v19 +; GISEL-NEXT: v_mov_b32_e32 v15, v20 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[34:37], v[48:55], v16 +; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[30:33], v[22:29], v21 ; GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <16 x float> %result diff --git a/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll b/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll index 54b535ca43126..f76580b94e13c 100644 --- a/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll +++ b/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll @@ -5,6 +5,8 @@ ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GREEDY,GREEDY90A-GISEL %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -sgpr-regalloc=fast -vgpr-regalloc=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,FAST %s +; This is better with 90a + ; Check that Dst and SrcC of MFMA instructions reading more than 4 registers as SrcC ; is either completely disjoint or exactly the same, but does not alias. @@ -15,8 +17,8 @@ declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float, float, <4 x float>, i3 ; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32: ; GREEDY: v_mfma_f32_32x32x1{{.*}} a[0:31], v{{[0-9]+}}, v{{[0-9]+}}, a[0:31] ; GREEDY: v_mfma_f32_32x32x1{{.*}} a[32:63], v{{[0-9]+}}, v{{[0-9]+}}, a[0:31] -; FAST: v_mfma_f32_32x32x1{{.*}} a[64:95], v{{[0-9]+}}, v{{[0-9]+}}, a[64:95] -; FAST: v_mfma_f32_32x32x1{{.*}} a[32:63], v{{[0-9]+}}, v{{[0-9]+}}, a[64:95] +; FAST: v_mfma_f32_32x32x1{{.*}} a[32:63], v{{[0-9]+}}, v{{[0-9]+}}, a[32:63] +; FAST: v_mfma_f32_32x32x1{{.*}} a[0:31], v{{[0-9]+}}, v{{[0-9]+}}, a[32:63] ; GCN: v_mfma_f32_32x32x1{{.*}} a[0:31], v{{[0-9]+}}, v{{[0-9]+}}, a[0:31] define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 { bb: @@ -33,14 +35,14 @@ bb: ; GREEDY908: v_mfma_f32_16x16x1{{.*}} a[18:33], v{{[0-9]+}}, v{{[0-9]+}}, a[18:33] ; GREEDY908: v_mfma_f32_16x16x1{{.*}} a[2:17], v{{[0-9]+}}, v{{[0-9]+}}, a[18:33] -; GREEDY90A: v_mfma_f32_16x16x1{{.*}} a[16:31], v{{[0-9]+}}, v{{[0-9]+}}, a[16:31] -; GREEDY90A: v_mfma_f32_16x16x1{{.*}} a[0:15], v{{[0-9]+}}, v{{[0-9]+}}, a[16:31] +; GREEDY90A: v_mfma_f32_16x16x1{{.*}} a[18:33], v{{[0-9]+}}, v{{[0-9]+}}, a[18:33] +; GREEDY90A: v_mfma_f32_16x16x1{{.*}} a[2:17], v{{[0-9]+}}, v{{[0-9]+}}, a[18:33] ; GREEDY90A-GISEL: v_mfma_f32_16x16x1{{.*}} a[0:15], v{{[0-9]+}}, v{{[0-9]+}}, a[0:15] ; GREEDY90A-GISEL: v_mfma_f32_16x16x1{{.*}} a[16:31], v{{[0-9]+}}, v{{[0-9]+}}, a[0:15] -; FAST: v_mfma_f32_16x16x1{{.*}} a[32:47], v{{[0-9]+}}, v{{[0-9]+}}, a[32:47] -; FAST: v_mfma_f32_16x16x1{{.*}} a[16:31], v{{[0-9]+}}, v{{[0-9]+}}, a[32:47] +; FAST: v_mfma_f32_16x16x1{{.*}} a[0:15], v{{[0-9]+}}, v{{[0-9]+}}, a[0:15] +; FAST: v_mfma_f32_16x16x1{{.*}} a[16:31], v{{[0-9]+}}, v{{[0-9]+}}, a[0:15] ; GCN: v_mfma_f32_16x16x1{{.*}} a[0:15], v{{[0-9]+}}, v{{[0-9]+}}, a[0:15] define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { bb: @@ -58,8 +60,8 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32: ; GREEDY: v_mfma_f32_4x4x1{{.*}} a[0:3], v{{[0-9]+}}, v{{[0-9]+}}, a[0:3] ; GREEDY: v_mfma_f32_4x4x1{{.*}} a[2:5], v{{[0-9]+}}, v{{[0-9]+}}, a[0:3] -; FAST: v_mfma_f32_4x4x1{{.*}} a[8:11], v{{[0-9]+}}, v{{[0-9]+}}, a[0:3] -; FAST: v_mfma_f32_4x4x1{{.*}} a[4:7], v{{[0-9]+}}, v{{[0-9]+}}, a[8:11] +; FAST: v_mfma_f32_4x4x1{{.*}} a[0:3], v{{[0-9]+}}, v{{[0-9]+}}, a[0:3] +; FAST: v_mfma_f32_4x4x1{{.*}} a[4:7], v{{[0-9]+}}, v{{[0-9]+}}, a[0:3] ; GCN: v_mfma_f32_4x4x1{{.*}} a[0:3], v{{[0-9]+}}, v{{[0-9]+}}, a[0:3] define amdgpu_kernel void @test_mfma_f32_4x4x1f32(ptr addrspace(1) %arg) #0 { bb: diff --git a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll index dad59daaefb5f..3844d6054e130 100644 --- a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll +++ b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll @@ -6,32 +6,29 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) { ; GFX942-LABEL: matmul_kernel: ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX942-NEXT: v_mov_b32_e32 v1, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a2, v0 ; GFX942-NEXT: s_mov_b32 s2, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a0, v1 -; GFX942-NEXT: s_mov_b32 s3, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a1, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_cmp_lg_u32 s0, 0 ; GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX942-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GFX942-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 +; GFX942-NEXT: s_mov_b32 s3, 0 ; GFX942-NEXT: s_branch .LBB0_2 ; GFX942-NEXT: .LBB0_1: ; %bb2 ; GFX942-NEXT: ; in Loop: Header=BB0_2 Depth=1 ; GFX942-NEXT: s_or_b32 s4, s3, 1 ; GFX942-NEXT: s_ashr_i32 s5, s3, 31 ; GFX942-NEXT: s_mov_b32 s3, s2 -; GFX942-NEXT: v_mov_b64_e32 v[4:5], s[2:3] -; GFX942-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX942-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX942-NEXT: v_accvgpr_write_b32 a2, v2 -; GFX942-NEXT: v_accvgpr_write_b32 a3, v3 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX942-NEXT: v_accvgpr_mov_b32 a0, a2 +; GFX942-NEXT: v_accvgpr_mov_b32 a2, a1 +; GFX942-NEXT: v_accvgpr_mov_b32 a3, a1 ; GFX942-NEXT: s_and_b32 s3, s5, s4 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mfma_f32_16x16x16_f16 a[0:3], v[4:5], v[4:5], a[0:3] +; GFX942-NEXT: v_mfma_f32_16x16x16_f16 a[2:5], v[2:3], v[2:3], a[0:3] ; GFX942-NEXT: s_cbranch_execz .LBB0_4 ; GFX942-NEXT: .LBB0_2: ; %bb ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -39,40 +36,40 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) { ; GFX942-NEXT: s_cbranch_vccz .LBB0_1 ; GFX942-NEXT: ; %bb.3: ; GFX942-NEXT: ; implicit-def: $sgpr3 -; GFX942-NEXT: ; implicit-def: $agpr0 +; GFX942-NEXT: ; implicit-def: $agpr2 ; GFX942-NEXT: .LBB0_4: ; %common.ret ; GFX942-NEXT: s_endpgm ; ; GFX908-LABEL: matmul_kernel: ; GFX908: ; %bb.0: ; %entry ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX908-NEXT: v_mov_b32_e32 v1, 0 +; GFX908-NEXT: v_mov_b32_e32 v0, 0 +; GFX908-NEXT: v_accvgpr_write_b32 a1, 0 ; GFX908-NEXT: s_mov_b32 s2, 0 -; GFX908-NEXT: s_mov_b32 s3, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a0, v1 +; GFX908-NEXT: v_accvgpr_write_b32 a2, v0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: s_cmp_lg_u32 s0, 0 ; GFX908-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX908-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GFX908-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 +; GFX908-NEXT: s_mov_b32 s3, 0 ; GFX908-NEXT: s_branch .LBB0_2 ; GFX908-NEXT: .LBB0_1: ; %bb2 ; GFX908-NEXT: ; in Loop: Header=BB0_2 Depth=1 ; GFX908-NEXT: s_or_b32 s4, s3, 1 ; GFX908-NEXT: s_ashr_i32 s5, s3, 31 ; GFX908-NEXT: s_mov_b32 s3, s2 -; GFX908-NEXT: s_nop 3 -; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX908-NEXT: v_mov_b32_e32 v5, s3 -; GFX908-NEXT: v_mov_b32_e32 v4, s2 -; GFX908-NEXT: v_mov_b32_e32 v2, v1 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 +; GFX908-NEXT: v_mov_b32_e32 v1, s2 +; GFX908-NEXT: s_nop 2 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a2 +; GFX908-NEXT: v_mov_b32_e32 v2, s3 +; GFX908-NEXT: v_accvgpr_read_b32 v4, a1 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a1 ; GFX908-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX908-NEXT: v_accvgpr_write_b32 a1, v1 -; GFX908-NEXT: v_accvgpr_write_b32 a2, v2 +; GFX908-NEXT: v_accvgpr_write_b32 a2, v4 ; GFX908-NEXT: v_accvgpr_write_b32 a3, v3 ; GFX908-NEXT: s_and_b32 s3, s5, s4 -; GFX908-NEXT: v_mfma_f32_16x16x16f16 a[0:3], v[4:5], v[4:5], a[0:3] +; GFX908-NEXT: v_mfma_f32_16x16x16f16 a[2:5], v[1:2], v[1:2], a[0:3] ; GFX908-NEXT: s_cbranch_execz .LBB0_4 ; GFX908-NEXT: .LBB0_2: ; %bb ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -80,7 +77,7 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) { ; GFX908-NEXT: s_cbranch_vccz .LBB0_1 ; GFX908-NEXT: ; %bb.3: ; GFX908-NEXT: ; implicit-def: $sgpr3 -; GFX908-NEXT: ; implicit-def: $agpr0 +; GFX908-NEXT: ; implicit-def: $agpr2 ; GFX908-NEXT: .LBB0_4: ; %common.ret ; GFX908-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.mir b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.mir index 5c83170563e59..ee5481617cf59 100644 --- a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.mir +++ b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.mir @@ -45,8 +45,11 @@ body: | ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1, [[V_MOV_B32_e32_]], %subreg.sub2, [[V_MOV_B32_e32_]], %subreg.sub3 ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:areg_128_align2 = COPY [[REG_SEQUENCE]] - ; CHECK-NEXT: [[V_MFMA_F32_16X16X16F16_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY4]], [[COPY4]], killed [[COPY5]], 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec + ; CHECK-NEXT: [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec + ; CHECK-NEXT: [[V_ACCVGPR_WRITE_B32_e64_2:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:areg_128_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[V_ACCVGPR_WRITE_B32_e64_]], %subreg.sub1, [[V_ACCVGPR_WRITE_B32_e64_1]], %subreg.sub2, [[V_ACCVGPR_WRITE_B32_e64_2]], %subreg.sub3 + ; CHECK-NEXT: [[V_MFMA_F32_16X16X16F16_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY4]], [[COPY4]], killed [[REG_SEQUENCE2]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: @@ -77,16 +80,16 @@ body: | ; COALESCE-NEXT: S_BITCMP1_B32 [[S_LOAD_DWORD_IMM]], 0, implicit-def $scc ; COALESCE-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_64 = S_MOV_B32 0 ; COALESCE-NEXT: [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit killed $scc - ; COALESCE-NEXT: undef [[V_MOV_B32_e32_:%[0-9]+]].sub1:vreg_128_align2 = V_MOV_B32_e32 0, implicit $exec ; COALESCE-NEXT: undef [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]].sub0:areg_128_align2 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec ; COALESCE-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_CSELECT_B64_]], implicit $exec ; COALESCE-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 1, [[V_CNDMASK_B32_e64_]], implicit $exec + ; COALESCE-NEXT: undef [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub1:areg_128_align2 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec ; COALESCE-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; COALESCE-NEXT: {{ $}} ; COALESCE-NEXT: bb.1: ; COALESCE-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) ; COALESCE-NEXT: {{ $}} - ; COALESCE-NEXT: [[V_MOV_B32_e32_:%[0-9]+]].sub0:vreg_128_align2 = COPY [[V_ACCVGPR_WRITE_B32_e64_]].sub0 + ; COALESCE-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_ACCVGPR_WRITE_B32_e64_]].sub0 ; COALESCE-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 -1 ; COALESCE-NEXT: $vcc = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_]], implicit-def dead $scc ; COALESCE-NEXT: S_CBRANCH_VCCNZ %bb.3, implicit killed $vcc @@ -98,12 +101,12 @@ body: | ; COALESCE-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_MOV_B32_1]], 1, implicit-def dead $scc ; COALESCE-NEXT: [[S_ASHR_I32_:%[0-9]+]]:sreg_32 = S_ASHR_I32 [[S_MOV_B32_1]], 31, implicit-def dead $scc ; COALESCE-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_ASHR_I32_]], [[S_OR_B32_]], implicit-def dead $scc - ; COALESCE-NEXT: [[V_MOV_B32_e32_:%[0-9]+]].sub2:vreg_128_align2 = COPY [[V_MOV_B32_e32_]].sub1 - ; COALESCE-NEXT: [[V_MOV_B32_e32_:%[0-9]+]].sub3:vreg_128_align2 = COPY [[V_MOV_B32_e32_]].sub1 ; COALESCE-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_64 = COPY [[S_MOV_B32_]].sub0 - ; COALESCE-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B32_]] - ; COALESCE-NEXT: [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[V_MOV_B32_e32_]] - ; COALESCE-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec + ; COALESCE-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B32_]] + ; COALESCE-NEXT: [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub0:areg_128_align2 = COPY [[COPY1]] + ; COALESCE-NEXT: [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub2:areg_128_align2 = COPY [[V_ACCVGPR_WRITE_B32_e64_1]].sub1 + ; COALESCE-NEXT: [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub3:areg_128_align2 = COPY [[V_ACCVGPR_WRITE_B32_e64_1]].sub1 + ; COALESCE-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY2]], [[COPY2]], [[V_ACCVGPR_WRITE_B32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec ; COALESCE-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 0 ; COALESCE-NEXT: {{ $}} ; COALESCE-NEXT: bb.3: @@ -131,16 +134,16 @@ body: | ; GFX908-COALESCE-NEXT: S_BITCMP1_B32 [[S_LOAD_DWORD_IMM]], 0, implicit-def $scc ; GFX908-COALESCE-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_64 = S_MOV_B32 0 ; GFX908-COALESCE-NEXT: [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit killed $scc - ; GFX908-COALESCE-NEXT: undef [[V_MOV_B32_e32_:%[0-9]+]].sub1:vreg_128_align2 = V_MOV_B32_e32 0, implicit $exec ; GFX908-COALESCE-NEXT: undef [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]].sub0:areg_128_align2 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec ; GFX908-COALESCE-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_CSELECT_B64_]], implicit $exec ; GFX908-COALESCE-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 1, [[V_CNDMASK_B32_e64_]], implicit $exec + ; GFX908-COALESCE-NEXT: undef [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub1:areg_128_align2 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec ; GFX908-COALESCE-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX908-COALESCE-NEXT: {{ $}} ; GFX908-COALESCE-NEXT: bb.1: ; GFX908-COALESCE-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) ; GFX908-COALESCE-NEXT: {{ $}} - ; GFX908-COALESCE-NEXT: [[V_MOV_B32_e32_:%[0-9]+]].sub0:vreg_128_align2 = COPY [[V_ACCVGPR_WRITE_B32_e64_]].sub0 + ; GFX908-COALESCE-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_ACCVGPR_WRITE_B32_e64_]].sub0 ; GFX908-COALESCE-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 -1 ; GFX908-COALESCE-NEXT: $vcc = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_]], implicit-def dead $scc ; GFX908-COALESCE-NEXT: S_CBRANCH_VCCNZ %bb.3, implicit killed $vcc @@ -152,12 +155,12 @@ body: | ; GFX908-COALESCE-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_MOV_B32_1]], 1, implicit-def dead $scc ; GFX908-COALESCE-NEXT: [[S_ASHR_I32_:%[0-9]+]]:sreg_32 = S_ASHR_I32 [[S_MOV_B32_1]], 31, implicit-def dead $scc ; GFX908-COALESCE-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_ASHR_I32_]], [[S_OR_B32_]], implicit-def dead $scc - ; GFX908-COALESCE-NEXT: [[V_MOV_B32_e32_:%[0-9]+]].sub2:vreg_128_align2 = COPY [[V_MOV_B32_e32_]].sub1 - ; GFX908-COALESCE-NEXT: [[V_MOV_B32_e32_:%[0-9]+]].sub3:vreg_128_align2 = COPY [[V_MOV_B32_e32_]].sub1 ; GFX908-COALESCE-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_64 = COPY [[S_MOV_B32_]].sub0 - ; GFX908-COALESCE-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B32_]] - ; GFX908-COALESCE-NEXT: [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[V_MOV_B32_e32_]] - ; GFX908-COALESCE-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX908-COALESCE-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B32_]] + ; GFX908-COALESCE-NEXT: [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub0:areg_128_align2 = COPY [[COPY1]] + ; GFX908-COALESCE-NEXT: [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub2:areg_128_align2 = COPY [[V_ACCVGPR_WRITE_B32_e64_1]].sub1 + ; GFX908-COALESCE-NEXT: [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub3:areg_128_align2 = COPY [[V_ACCVGPR_WRITE_B32_e64_1]].sub1 + ; GFX908-COALESCE-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY2]], [[COPY2]], [[V_ACCVGPR_WRITE_B32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec ; GFX908-COALESCE-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 0 ; GFX908-COALESCE-NEXT: {{ $}} ; GFX908-COALESCE-NEXT: bb.3: diff --git a/llvm/test/CodeGen/AMDGPU/remaining-virtual-register-operands.ll b/llvm/test/CodeGen/AMDGPU/remaining-virtual-register-operands.ll index 9279b44edac75..8383930ebfcd1 100644 --- a/llvm/test/CodeGen/AMDGPU/remaining-virtual-register-operands.ll +++ b/llvm/test/CodeGen/AMDGPU/remaining-virtual-register-operands.ll @@ -1,4 +1,6 @@ -; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -verify-machineinstrs -filetype=null %s 2>&1 | FileCheck -implicit-check-not=error %s +; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -verify-machineinstrs -enable-misched=0 -filetype=null %s 2>&1 | FileCheck -implicit-check-not=error %s + +; Scheduler disabled to work around issue #129028 ; This testcase fails register allocation at the same time it performs ; virtual register splitting (by introducing VGPR to AGPR copies). We @@ -13,13 +15,16 @@ ; CHECK: error: :0:0: ran out of registers during register allocation define amdgpu_kernel void @alloc_failure_with_split_vregs(float %v0, float %v1) #0 { %agpr0 = call float asm sideeffect "; def $0", "=${a0}"() - %agpr.vec = insertelement <16 x float> undef, float %agpr0, i32 0 + %agpr.vec = insertelement <16 x float> zeroinitializer, float %agpr0, i32 0 %mfma0 = call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float %v0, float %v1, <16 x float> %agpr.vec, i32 0, i32 0, i32 0) %mfma0.3 = extractelement <16 x float> %mfma0, i32 3 %insert = insertelement <16 x float> %mfma0, float %agpr0, i32 8 + %mfma1 = call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float %v0, float %v1, <16 x float> %insert, i32 0, i32 0, i32 0) %mfma1.3 = extractelement <16 x float> %mfma1, i32 3 call void asm sideeffect "; use $0", "{a1}"(float %mfma1.3) + call void asm sideeffect "; use $0", "a"(<16 x float> %agpr.vec) + ret void } diff --git a/llvm/test/CodeGen/AMDGPU/schedule-xdl-resource.ll b/llvm/test/CodeGen/AMDGPU/schedule-xdl-resource.ll index a7e361b4b67cc..a703ce0f6064c 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-xdl-resource.ll +++ b/llvm/test/CodeGen/AMDGPU/schedule-xdl-resource.ll @@ -3,9 +3,8 @@ declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half>, <4 x half>, <32 x float>, i32, i32, i32) -; CHECK: CritRes: {{[0-9]+}} HWXDL -; CHECK: Picking: Cand SU([[nid:[0-9]+]]) RES-DEMAND -; CHECK: Scheduling SU([[nid]]) {{.*}} V_MFMA_F32_32X32X4F16 +; CHECK: Scheduling SU({{[0-9]+}}) {{.*}} V_MFMA_F32_32X32X4F16 +; CHECK: HWXDL +16x1u define amdgpu_kernel void @schedule-xdl-resource(ptr addrspace(1) %in, ptr addrspace(1) %out, ptr addrspace(3) %lds, i32 %stride) #0 { %in_ptr.1 = getelementptr <32 x float>, ptr addrspace(1) %in, i32 %stride %in_ptr.2 = getelementptr <32 x float>, ptr addrspace(1) %in_ptr.1, i32 %stride diff --git a/llvm/test/CodeGen/AMDGPU/si-fold-operands-agpr-copy-reg-sequence.mir b/llvm/test/CodeGen/AMDGPU/si-fold-operands-agpr-copy-reg-sequence.mir index 9d167f578e9eb..80f13b5102097 100644 --- a/llvm/test/CodeGen/AMDGPU/si-fold-operands-agpr-copy-reg-sequence.mir +++ b/llvm/test/CodeGen/AMDGPU/si-fold-operands-agpr-copy-reg-sequence.mir @@ -10,8 +10,12 @@ body: | ; CHECK-LABEL: name: s_mov_b32_0_copy_vgpr_reg_sequence_128_splat_copy_to_agpr ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1, [[V_MOV_B32_e32_]], %subreg.sub2, [[V_MOV_B32_e32_]], %subreg.sub3 - ; CHECK-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY [[REG_SEQUENCE]] - ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[COPY]] + ; CHECK-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec + ; CHECK-NEXT: [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec + ; CHECK-NEXT: [[V_ACCVGPR_WRITE_B32_e64_2:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec + ; CHECK-NEXT: [[V_ACCVGPR_WRITE_B32_e64_3:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:areg_128 = REG_SEQUENCE [[V_ACCVGPR_WRITE_B32_e64_]], %subreg.sub0, [[V_ACCVGPR_WRITE_B32_e64_1]], %subreg.sub1, [[V_ACCVGPR_WRITE_B32_e64_2]], %subreg.sub2, [[V_ACCVGPR_WRITE_B32_e64_3]], %subreg.sub3 + ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[REG_SEQUENCE1]] ; CHECK-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3 %0:sgpr_32 = S_MOV_B32 0 %1:vgpr_32 = COPY killed %0 @@ -30,8 +34,12 @@ body: | ; CHECK-LABEL: name: v_mov_b32_0_vgpr_reg_sequence_128_splat_copy_to_agpr ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1, [[V_MOV_B32_e32_]], %subreg.sub2, [[V_MOV_B32_e32_]], %subreg.sub3 - ; CHECK-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY killed [[REG_SEQUENCE]] - ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[COPY]] + ; CHECK-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec + ; CHECK-NEXT: [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec + ; CHECK-NEXT: [[V_ACCVGPR_WRITE_B32_e64_2:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec + ; CHECK-NEXT: [[V_ACCVGPR_WRITE_B32_e64_3:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:areg_128 = REG_SEQUENCE [[V_ACCVGPR_WRITE_B32_e64_]], %subreg.sub0, [[V_ACCVGPR_WRITE_B32_e64_1]], %subreg.sub1, [[V_ACCVGPR_WRITE_B32_e64_2]], %subreg.sub2, [[V_ACCVGPR_WRITE_B32_e64_3]], %subreg.sub3 + ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[REG_SEQUENCE1]] ; CHECK-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3 %0:vgpr_32 = V_MOV_B32_e32 0, implicit $exec %1:vreg_128 = REG_SEQUENCE %0, %subreg.sub0, %0, %subreg.sub1, %0, %subreg.sub2, %0, %subreg.sub3 @@ -50,8 +58,12 @@ body: | ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1, [[V_MOV_B32_e32_]], %subreg.sub2, [[V_MOV_B32_e32_]], %subreg.sub3 ; CHECK-NEXT: S_NOP 0, implicit [[REG_SEQUENCE]] - ; CHECK-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY killed [[REG_SEQUENCE]] - ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[COPY]] + ; CHECK-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec + ; CHECK-NEXT: [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec + ; CHECK-NEXT: [[V_ACCVGPR_WRITE_B32_e64_2:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec + ; CHECK-NEXT: [[V_ACCVGPR_WRITE_B32_e64_3:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:areg_128 = REG_SEQUENCE [[V_ACCVGPR_WRITE_B32_e64_]], %subreg.sub0, [[V_ACCVGPR_WRITE_B32_e64_1]], %subreg.sub1, [[V_ACCVGPR_WRITE_B32_e64_2]], %subreg.sub2, [[V_ACCVGPR_WRITE_B32_e64_3]], %subreg.sub3 + ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[REG_SEQUENCE1]] ; CHECK-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3 %0:vgpr_32 = V_MOV_B32_e32 0, implicit $exec %1:vreg_128 = REG_SEQUENCE %0, %subreg.sub0, %0, %subreg.sub1, %0, %subreg.sub2, %0, %subreg.sub3 @@ -70,8 +82,8 @@ body: | ; CHECK-LABEL: name: s_mov_b32_literal_copy_vgpr_reg_sequence_128_splat_copy_to_agpr ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 999, implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1, [[V_MOV_B32_e32_]], %subreg.sub2, [[V_MOV_B32_e32_]], %subreg.sub3 - ; CHECK-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY [[REG_SEQUENCE]] - ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[COPY]] + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:areg_128 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1, [[V_MOV_B32_e32_]], %subreg.sub2, [[V_MOV_B32_e32_]], %subreg.sub3 + ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[REG_SEQUENCE1]] ; CHECK-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3 %0:sgpr_32 = S_MOV_B32 999 %1:vgpr_32 = COPY %0 @@ -90,8 +102,8 @@ body: | ; CHECK-LABEL: name: v_mov_b32_literal_vgpr_reg_sequence_128_splat_copy_to_agpr ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 999, implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1, [[V_MOV_B32_e32_]], %subreg.sub2, [[V_MOV_B32_e32_]], %subreg.sub3 - ; CHECK-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY killed [[REG_SEQUENCE]] - ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[COPY]] + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:areg_128 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1, [[V_MOV_B32_e32_]], %subreg.sub2, [[V_MOV_B32_e32_]], %subreg.sub3 + ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[REG_SEQUENCE1]] ; CHECK-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3 %0:vgpr_32 = V_MOV_B32_e32 999, implicit $exec %1:vreg_128 = REG_SEQUENCE %0, %subreg.sub0, %0, %subreg.sub1, %0, %subreg.sub2, %0, %subreg.sub3 @@ -110,8 +122,8 @@ body: | ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 999, implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1, [[V_MOV_B32_e32_]], %subreg.sub2, [[V_MOV_B32_e32_]], %subreg.sub3 ; CHECK-NEXT: S_NOP 0, implicit [[REG_SEQUENCE]] - ; CHECK-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY killed [[REG_SEQUENCE]] - ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[COPY]] + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:areg_128 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1, [[V_MOV_B32_e32_]], %subreg.sub2, [[V_MOV_B32_e32_]], %subreg.sub3 + ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[REG_SEQUENCE1]] ; CHECK-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3 %0:vgpr_32 = V_MOV_B32_e32 999, implicit $exec %1:vreg_128 = REG_SEQUENCE %0, %subreg.sub0, %0, %subreg.sub1, %0, %subreg.sub2, %0, %subreg.sub3 @@ -205,12 +217,7 @@ body: | bb.0: ; CHECK-LABEL: name: s_mov_b32_999_splat_sgpr_128_copy_vgpr_copy_agpr ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 999 - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:agpr_32 = COPY [[COPY]] - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:agpr_32 = COPY [[COPY]] - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:agpr_32 = COPY [[COPY]] - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:agpr_32 = COPY [[COPY]] - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:areg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:areg_128 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1, [[S_MOV_B32_]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3 ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[REG_SEQUENCE]] ; CHECK-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3 %0:sgpr_32 = S_MOV_B32 999 @@ -231,11 +238,8 @@ body: | ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 999 ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 1 ; CHECK-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 1, implicit $exec - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:agpr_32 = COPY [[COPY]] - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:agpr_32 = COPY [[COPY]] ; CHECK-NEXT: [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 1, implicit $exec - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:areg_128 = REG_SEQUENCE [[V_ACCVGPR_WRITE_B32_e64_]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[V_ACCVGPR_WRITE_B32_e64_1]], %subreg.sub3 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:areg_128 = REG_SEQUENCE [[V_ACCVGPR_WRITE_B32_e64_]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1, [[S_MOV_B32_]], %subreg.sub2, [[V_ACCVGPR_WRITE_B32_e64_1]], %subreg.sub3 ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[REG_SEQUENCE]] ; CHECK-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3 %0:sgpr_32 = S_MOV_B32 999 @@ -256,8 +260,12 @@ body: | ; CHECK-LABEL: name: s_mov_b32_0_splat_sgpr_128_copy_agpr ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1, [[S_MOV_B32_]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3 - ; CHECK-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY [[REG_SEQUENCE]] - ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[COPY]] + ; CHECK-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec + ; CHECK-NEXT: [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec + ; CHECK-NEXT: [[V_ACCVGPR_WRITE_B32_e64_2:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec + ; CHECK-NEXT: [[V_ACCVGPR_WRITE_B32_e64_3:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:areg_128 = REG_SEQUENCE [[V_ACCVGPR_WRITE_B32_e64_]], %subreg.sub0, [[V_ACCVGPR_WRITE_B32_e64_1]], %subreg.sub1, [[V_ACCVGPR_WRITE_B32_e64_2]], %subreg.sub2, [[V_ACCVGPR_WRITE_B32_e64_3]], %subreg.sub3 + ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[REG_SEQUENCE1]] ; CHECK-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3 %0:sgpr_32 = S_MOV_B32 0 %1:sgpr_128 = REG_SEQUENCE %0, %subreg.sub0, %0, %subreg.sub1, %0, %subreg.sub2, %0, %subreg.sub3 @@ -277,8 +285,10 @@ body: | ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 999, implicit $exec ; CHECK-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8, implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1, [[V_MOV_B32_e32_2]], %subreg.sub2, [[V_MOV_B32_e32_1]], %subreg.sub3 - ; CHECK-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY killed [[REG_SEQUENCE]] - ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[COPY]] + ; CHECK-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec + ; CHECK-NEXT: [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 8, implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:areg_128 = REG_SEQUENCE [[V_ACCVGPR_WRITE_B32_e64_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1, [[V_ACCVGPR_WRITE_B32_e64_1]], %subreg.sub2, [[V_MOV_B32_e32_1]], %subreg.sub3 + ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[REG_SEQUENCE1]] ; CHECK-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3 %0:vgpr_32 = V_MOV_B32_e32 0, implicit $exec %1:vgpr_32 = V_MOV_B32_e32 999, implicit $exec @@ -298,8 +308,9 @@ body: | ; CHECK-LABEL: name: s_mov_b64_0_copy_vgpr_reg_sequence_128_splat_copy_to_agpr_elt64 ; CHECK: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 0, implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_MOV_B]], %subreg.sub0_sub1, [[V_MOV_B]], %subreg.sub2_sub3 - ; CHECK-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY [[REG_SEQUENCE]] - ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[COPY]] + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY [[V_MOV_B]] + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:areg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0_sub1, [[COPY]], %subreg.sub2_sub3 + ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[REG_SEQUENCE1]] ; CHECK-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3 %0:sreg_64 = S_MOV_B64 0 %1:vreg_64_align2 = COPY killed %0 @@ -318,8 +329,9 @@ body: | ; CHECK-LABEL: name: s_mov_b64_0_copy_vgpr_reg_sequence_128_splat_copy_to_agpr_subreg_elt32 ; CHECK: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 0, implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_MOV_B]].sub0, %subreg.sub0, [[V_MOV_B]].sub1, %subreg.sub1, [[V_MOV_B]], %subreg.sub1_sub2 - ; CHECK-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY [[REG_SEQUENCE]] - ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[COPY]] + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY [[V_MOV_B]] + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:areg_128 = REG_SEQUENCE [[V_MOV_B]].sub0, %subreg.sub0, [[V_MOV_B]].sub1, %subreg.sub1, [[COPY]], %subreg.sub1_sub2 + ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[REG_SEQUENCE1]] ; CHECK-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3 %0:sreg_64 = S_MOV_B64 0 %1:vreg_64_align2 = COPY killed %0 @@ -342,8 +354,10 @@ body: | ; CHECK-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 0, implicit $exec ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_MOV_B]], %subreg.sub0_sub1, [[COPY]], %subreg.sub2_sub3 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_128 = COPY [[REG_SEQUENCE]] - ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[COPY1]] + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[V_MOV_B]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[COPY]] + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:areg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3 + ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[REG_SEQUENCE1]] ; CHECK-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3 %0:sreg_64 = S_MOV_B64 0 %1:vreg_64_align2 = COPY killed %0 @@ -367,8 +381,9 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY killed [[COPY]] ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY1]], %subreg.sub2_sub3 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:areg_128 = COPY [[REG_SEQUENCE]] - ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[COPY2]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[COPY1]] + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:areg_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3 + ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[REG_SEQUENCE1]] ; CHECK-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3 %0:sreg_64 = COPY $sgpr8_sgpr9 %1:vreg_64_align2 = COPY killed %0 @@ -392,8 +407,10 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY $sgpr10_sgpr11 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_192 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3, [[COPY1]], %subreg.sub4_sub5 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:areg_192 = COPY [[REG_SEQUENCE]] - ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 = COPY [[COPY2]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[COPY1]] + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:areg_192 = REG_SEQUENCE [[COPY2]], %subreg.sub0_sub1, [[COPY3]], %subreg.sub2_sub3, [[COPY2]], %subreg.sub4_sub5 + ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 = COPY [[REG_SEQUENCE1]] ; CHECK-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 %0:sreg_64 = COPY $sgpr8_sgpr9 %1:sreg_64 = COPY $sgpr10_sgpr11 @@ -418,8 +435,10 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr10 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr11 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_192 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3, [[COPY2]], %subreg.sub4, [[COPY2]], %subreg.sub5 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:areg_192 = COPY [[REG_SEQUENCE]] - ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 = COPY [[COPY3]] + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[COPY1]] + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[COPY2]] + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:areg_192 = REG_SEQUENCE [[COPY3]], %subreg.sub0_sub1, [[COPY4]], %subreg.sub2_sub3, [[COPY4]], %subreg.sub4, [[COPY4]], %subreg.sub5 + ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 = COPY [[REG_SEQUENCE1]] ; CHECK-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 %0:sreg_64 = COPY $sgpr8_sgpr9 %1:sreg_32 = COPY $sgpr10 @@ -492,8 +511,11 @@ body: | ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr10 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr11 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_192 = REG_SEQUENCE [[COPY]], %subreg.sub3, [[COPY1]], %subreg.sub2, [[COPY2]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:areg_192 = COPY [[REG_SEQUENCE]] - ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]] + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:areg_192 = REG_SEQUENCE [[COPY4]], %subreg.sub3, [[COPY5]], %subreg.sub2, [[COPY6]], %subreg.sub0, [[COPY6]], %subreg.sub1 + ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[REG_SEQUENCE1]] ; CHECK-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3 %0:sreg_32 = COPY $sgpr8 %1:sreg_32 = COPY $sgpr9 @@ -519,8 +541,10 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr8 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr9 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_192 = REG_SEQUENCE [[COPY]], %subreg.sub3, [[COPY1]], %subreg.sub2, [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:areg_192 = COPY [[REG_SEQUENCE]] - ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[COPY2]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]] + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:areg_192 = REG_SEQUENCE [[COPY2]], %subreg.sub3, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[REG_SEQUENCE1]] ; CHECK-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3 %0:sreg_32 = COPY $sgpr8 %1:sreg_32 = COPY $sgpr9 @@ -544,9 +568,11 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr8 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr9 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_192 = REG_SEQUENCE [[COPY]], %subreg.sub3, [[COPY1]], %subreg.sub2, [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:areg_192 = COPY [[REG_SEQUENCE]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]] + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:areg_192 = REG_SEQUENCE [[COPY2]], %subreg.sub3, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 ; CHECK-NEXT: S_NOP 0, implicit [[COPY]] - ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[COPY2]] + ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[REG_SEQUENCE1]] ; CHECK-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3 %0:sreg_32 = COPY $sgpr8 %1:sreg_32 = COPY $sgpr9 @@ -573,8 +599,10 @@ body: | ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr8 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr9 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:areg_128 = COPY [[REG_SEQUENCE]] - ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY3]] + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:areg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 + ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[REG_SEQUENCE1]] ; CHECK-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3 %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = COPY $vgpr1 @@ -602,8 +630,11 @@ body: | ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr9 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr10 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:areg_128 = COPY [[REG_SEQUENCE]] - ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY3]] + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:areg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY6]], %subreg.sub3 + ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[REG_SEQUENCE1]] ; CHECK-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3 %0:vgpr_32 = COPY $vgpr0 %1:sreg_32 = COPY $sgpr8 @@ -631,8 +662,9 @@ body: | ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr8 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:areg_128 = COPY [[REG_SEQUENCE]] - ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY3]] + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:areg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[REG_SEQUENCE1]] ; CHECK-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3 %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = COPY $vgpr1 @@ -654,8 +686,12 @@ body: | ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0 ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1, [[S_MOV_B32_]], %subreg.sub2, [[V_MOV_B32_e32_]], %subreg.sub3 - ; CHECK-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY [[REG_SEQUENCE]] - ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[COPY]] + ; CHECK-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec + ; CHECK-NEXT: [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 1, implicit $exec + ; CHECK-NEXT: [[V_ACCVGPR_WRITE_B32_e64_2:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec + ; CHECK-NEXT: [[V_ACCVGPR_WRITE_B32_e64_3:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 1, implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:areg_128 = REG_SEQUENCE [[V_ACCVGPR_WRITE_B32_e64_]], %subreg.sub0, [[V_ACCVGPR_WRITE_B32_e64_1]], %subreg.sub1, [[V_ACCVGPR_WRITE_B32_e64_2]], %subreg.sub2, [[V_ACCVGPR_WRITE_B32_e64_3]], %subreg.sub3 + ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[REG_SEQUENCE1]] ; CHECK-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3 %0:sgpr_32 = S_MOV_B32 0 %1:vgpr_32 = V_MOV_B32_e32 1, implicit $exec @@ -675,8 +711,11 @@ body: | ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 999 ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1, [[S_MOV_B32_]], %subreg.sub2, [[V_MOV_B32_e32_]], %subreg.sub3 - ; CHECK-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY [[REG_SEQUENCE]] - ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[COPY]] + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; CHECK-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 1, implicit $exec + ; CHECK-NEXT: [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 1, implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:areg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[V_ACCVGPR_WRITE_B32_e64_]], %subreg.sub1, [[COPY]], %subreg.sub2, [[V_ACCVGPR_WRITE_B32_e64_1]], %subreg.sub3 + ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[REG_SEQUENCE1]] ; CHECK-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3 %0:sgpr_32 = S_MOV_B32 999 %1:vgpr_32 = V_MOV_B32_e32 1, implicit $exec @@ -696,8 +735,10 @@ body: | ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 999 ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1, [[S_MOV_B32_]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3 - ; CHECK-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY [[REG_SEQUENCE]] - ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[COPY]] + ; CHECK-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 1, implicit $exec + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:areg_128 = REG_SEQUENCE [[V_ACCVGPR_WRITE_B32_e64_]], %subreg.sub0, [[COPY]], %subreg.sub1, [[COPY]], %subreg.sub2, [[COPY]], %subreg.sub3 + ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[REG_SEQUENCE1]] ; CHECK-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3 %0:sgpr_32 = S_MOV_B32 999 %1:vgpr_32 = V_MOV_B32_e32 1, implicit $exec @@ -717,8 +758,9 @@ body: | ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 8 ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 123, implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1, [[V_MOV_B32_e32_]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3 - ; CHECK-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY [[REG_SEQUENCE]] - ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[COPY]] + ; CHECK-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 8, implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:areg_128 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1, [[V_MOV_B32_e32_]], %subreg.sub2, [[V_ACCVGPR_WRITE_B32_e64_]], %subreg.sub3 + ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[REG_SEQUENCE1]] ; CHECK-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3 %0:sgpr_32 = S_MOV_B32 8 %1:vgpr_32 = V_MOV_B32_e32 123, implicit $exec