diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index b77da4d612dd4..de89ce8f5b8ef 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -257,6 +257,7 @@ class SIFoldOperandsImpl {
   std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;
   bool tryFoldOMod(MachineInstr &MI);
   bool tryFoldRegSequence(MachineInstr &MI);
+  bool tryFoldImmRegSequence(MachineInstr &MI);
   bool tryFoldPhiAGPR(MachineInstr &MI);
   bool tryFoldLoad(MachineInstr &MI);
 
@@ -2331,6 +2332,114 @@ bool SIFoldOperandsImpl::tryFoldOMod(MachineInstr &MI) {
   return true;
 }
 
+// gfx942+ can use V_MOV_B64 for materializing constant immediates.
+// For example:
+// %0:vgpr_32 = V_MOV_B32 0, implicit $exec
+// %1:vreg_64_align2 = REG_SEQUENCE %0, %subreg.sub0, %0, %subreg.sub1
+// ->
+// %1:vreg_64_align2 = V_MOV_B64_PSEUDO 0, implicit $exec
+bool SIFoldOperandsImpl::tryFoldImmRegSequence(MachineInstr &MI) {
+  assert(MI.isRegSequence() &&
+         "MachineInstr is not expected REG_SEQUENCE instruction");
+  Register Reg = MI.getOperand(0).getReg();
+  const TargetRegisterClass *DefRC = MRI->getRegClass(Reg);
+  const MCInstrDesc &MovDesc = TII->get(AMDGPU::V_MOV_B64_PSEUDO);
+  const TargetRegisterClass *RC =
+      TII->getRegClass(MovDesc, 0, TRI, *MI.getMF());
+
+  if (!ST->hasMovB64() || !TRI->isVGPR(*MRI, Reg) ||
+      !MRI->hasOneNonDBGUse(Reg) ||
+      (!TRI->getCompatibleSubRegClass(DefRC, RC, AMDGPU::sub0_sub1) &&
+       DefRC != RC))
+    return false;
+
+  SmallVector<std::pair<MachineOperand *, unsigned>, 32> Defs;
+  if (!getRegSeqInit(Defs, Reg))
+    return false;
+
+  // Only attempt to fold immediate materializations.
+  if (!Defs.empty() &&
+      llvm::any_of(Defs, [](const std::pair<MachineOperand *, unsigned> &Op) {
+        return !Op.first->isImm();
+      }))
+    return false;
+
+  SmallVector<uint64_t, 8> ImmVals;
+  uint64_t ImmVal = 0;
+  uint64_t ImmSize = 0;
+  uint64_t RemainderSize = TRI->getRegSizeInBits(*DefRC);
+  SmallVector<std::pair<MachineOperand *, unsigned>, 4> Remainders;
+  for (auto &[Op, SubIdx] : Defs) {
+    unsigned SubRegSize = TRI->getSubRegIdxSize(SubIdx);
+    unsigned Shift = (TRI->getChannelFromSubReg(SubIdx) % 2) * SubRegSize;
+    ImmSize += SubRegSize;
+    ImmVal |= Op->getImm() << Shift;
+
+    if (SubRegSize == 64)
+      return false;
+
+    if (ImmSize == 64) {
+      // Only 32 bit literals can be encoded.
+      if (!isUInt<32>(ImmVal))
+        return false;
+      ImmVals.push_back(ImmVal);
+      ImmVal = 0;
+      ImmSize = 0;
+      RemainderSize -= 64;
+    } else if ((RemainderSize / 64 == 0) && (RemainderSize % 64 != 0)) {
+      // There's some remainder to consider.
+      Remainders.push_back({Op, SubRegSize});
+    }
+  }
+
+  // Can only combine REG_SEQUENCE into one 64b immediate materialization mov.
+  if (DefRC == RC) {
+    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), MovDesc, Reg)
+        .addImm(ImmVals[0]);
+    MI.eraseFromParent();
+    return true;
+  }
+
+  if (ImmVals.size() == 1 && RemainderSize == 0)
+    return false;
+
+  // Can't bail from here on out: modifying the MI.
+
+  // Remove source operands.
+  for (unsigned i = MI.getNumOperands() - 1; i > 0; --i)
+    MI.removeOperand(i);
+
+  unsigned Ch = 0;
+  for (uint64_t Val : ImmVals) {
+    Register MovReg = MRI->createVirtualRegister(RC);
+    // Duplicate vmov imm materializations (e.g., splatted operands) should get
+    // combined by MachineCSE pass.
+    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
+            TII->get(AMDGPU::V_MOV_B64_PSEUDO), MovReg)
+        .addImm(Val);
+
+    // 2 subregs with no overlap (i.e., sub0_sub1, sub2_sub3, etc.).
+ unsigned SubReg64B = + SIRegisterInfo::getSubRegFromChannel(/*Channel=*/Ch * 2, /*SubRegs=*/2); + + MI.addOperand(MachineOperand::CreateReg(MovReg, /*isDef=*/false)); + MI.addOperand(MachineOperand::CreateImm(SubReg64B)); + ++Ch; + } + Ch *= 2; + for (auto &[Op, Size] : Remainders) { + unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(/*Channel=*/Ch); + MachineOperand &Mov = Op->getParent()->getOperand(0); + MI.addOperand(MachineOperand::CreateReg(Mov.getReg(), /*isDef=*/false)); + MI.addOperand(MachineOperand::CreateImm(SubReg)); + ++Ch; + } + + LLVM_DEBUG(dbgs() << "Folded into " << MI); + + return true; +} + // Try to fold a reg_sequence with vgpr output and agpr inputs into an // instruction which can take an agpr. So far that means a store. bool SIFoldOperandsImpl::tryFoldRegSequence(MachineInstr &MI) { @@ -2760,9 +2869,11 @@ bool SIFoldOperandsImpl::run(MachineFunction &MF) { continue; } - if (MI.isRegSequence() && tryFoldRegSequence(MI)) { - Changed = true; - continue; + if (MI.isRegSequence()) { + if (tryFoldImmRegSequence(MI) || tryFoldRegSequence(MI)) { + Changed = true; + continue; + } } if (MI.isPHI() && tryFoldPhiAGPR(MI)) { diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll index b5e579b78a59c..a43d9657cfb24 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -4139,8 +4139,7 @@ define void @store_load_i64_aligned(ptr addrspace(5) nocapture %arg) { ; GFX942-LABEL: store_load_i64_aligned: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, 15 -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], 15 ; GFX942-NEXT: scratch_store_dwordx2 v0, v[2:3], off sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v0, off sc0 sc1 @@ -4250,8 +4249,7 @@ define void @store_load_i64_unaligned(ptr addrspace(5) nocapture %arg) { ; GFX942-LABEL: 
store_load_i64_unaligned: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, 15 -; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], 15 ; GFX942-NEXT: scratch_store_dwordx2 v0, v[2:3], off sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v0, off sc0 sc1 @@ -5010,10 +5008,8 @@ define amdgpu_ps void @large_offset() { ; ; GFX942-LABEL: large_offset: ; GFX942: ; %bb.0: ; %bb -; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], 0 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX942-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:3024 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: scratch_load_dwordx4 v[0:3], off, off offset:3024 sc0 sc1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll index ff77d5ccbe312..47e7bbeddb298 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll @@ -75,10 +75,9 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) # ; GFX942-LABEL: test_mfma_f32_32x32x4bf16_1k: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 -; GFX942-NEXT: v_mov_b32_e32 v1, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, 1 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: v_mov_b32_e32 v0, 2 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], 1 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], 2 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 ; GFX942-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 @@ -116,18 +115,18 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) # ; GFX942-NEXT: v_accvgpr_write_b32 a30, s14 ; GFX942-NEXT: v_accvgpr_write_b32 
a31, s15 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f32_32x32x4_2b_bf16 a[0:31], v[2:3], v[0:1], a[0:31] cbsz:1 abid:2 blgp:3 +; GFX942-NEXT: v_mfma_f32_32x32x4_2b_bf16 a[0:31], v[0:1], v[2:3], a[0:31] cbsz:1 abid:2 blgp:3 ; GFX942-NEXT: s_nop 7 ; GFX942-NEXT: s_nop 7 ; GFX942-NEXT: s_nop 2 -; GFX942-NEXT: global_store_dwordx4 v1, a[24:27], s[34:35] offset:96 -; GFX942-NEXT: global_store_dwordx4 v1, a[28:31], s[34:35] offset:112 -; GFX942-NEXT: global_store_dwordx4 v1, a[16:19], s[34:35] offset:64 -; GFX942-NEXT: global_store_dwordx4 v1, a[20:23], s[34:35] offset:80 -; GFX942-NEXT: global_store_dwordx4 v1, a[8:11], s[34:35] offset:32 -; GFX942-NEXT: global_store_dwordx4 v1, a[12:15], s[34:35] offset:48 -; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[34:35] -; GFX942-NEXT: global_store_dwordx4 v1, a[4:7], s[34:35] offset:16 +; GFX942-NEXT: global_store_dwordx4 v4, a[24:27], s[34:35] offset:96 +; GFX942-NEXT: global_store_dwordx4 v4, a[28:31], s[34:35] offset:112 +; GFX942-NEXT: global_store_dwordx4 v4, a[16:19], s[34:35] offset:64 +; GFX942-NEXT: global_store_dwordx4 v4, a[20:23], s[34:35] offset:80 +; GFX942-NEXT: global_store_dwordx4 v4, a[8:11], s[34:35] offset:32 +; GFX942-NEXT: global_store_dwordx4 v4, a[12:15], s[34:35] offset:48 +; GFX942-NEXT: global_store_dwordx4 v4, a[0:3], s[34:35] +; GFX942-NEXT: global_store_dwordx4 v4, a[4:7], s[34:35] offset:16 ; GFX942-NEXT: s_endpgm ; ; GFX90A-VGPR-LABEL: test_mfma_f32_32x32x4bf16_1k: @@ -191,10 +190,9 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) # ; GFX942-VGPR-LABEL: test_mfma_f32_32x32x4bf16_1k: ; GFX942-VGPR: ; %bb.0: ; %bb ; GFX942-VGPR-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v33, 0 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v34, 1 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v35, v33 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v32, 2 +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[32:33], 1 +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[34:35], 2 +; GFX942-VGPR-NEXT: 
v_mov_b32_e32 v36, 0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 ; GFX942-VGPR-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 @@ -232,18 +230,18 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) # ; GFX942-VGPR-NEXT: v_mov_b32_e32 v30, s14 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v31, s15 ; GFX942-VGPR-NEXT: s_nop 1 -; GFX942-VGPR-NEXT: v_mfma_f32_32x32x4_2b_bf16 v[0:31], v[34:35], v[32:33], v[0:31] cbsz:1 abid:2 blgp:3 +; GFX942-VGPR-NEXT: v_mfma_f32_32x32x4_2b_bf16 v[0:31], v[32:33], v[34:35], v[0:31] cbsz:1 abid:2 blgp:3 ; GFX942-VGPR-NEXT: s_nop 7 ; GFX942-VGPR-NEXT: s_nop 7 ; GFX942-VGPR-NEXT: s_nop 2 -; GFX942-VGPR-NEXT: global_store_dwordx4 v33, v[24:27], s[34:35] offset:96 -; GFX942-VGPR-NEXT: global_store_dwordx4 v33, v[28:31], s[34:35] offset:112 -; GFX942-VGPR-NEXT: global_store_dwordx4 v33, v[16:19], s[34:35] offset:64 -; GFX942-VGPR-NEXT: global_store_dwordx4 v33, v[20:23], s[34:35] offset:80 -; GFX942-VGPR-NEXT: global_store_dwordx4 v33, v[8:11], s[34:35] offset:32 -; GFX942-VGPR-NEXT: global_store_dwordx4 v33, v[12:15], s[34:35] offset:48 -; GFX942-VGPR-NEXT: global_store_dwordx4 v33, v[0:3], s[34:35] -; GFX942-VGPR-NEXT: global_store_dwordx4 v33, v[4:7], s[34:35] offset:16 +; GFX942-VGPR-NEXT: global_store_dwordx4 v36, v[24:27], s[34:35] offset:96 +; GFX942-VGPR-NEXT: global_store_dwordx4 v36, v[28:31], s[34:35] offset:112 +; GFX942-VGPR-NEXT: global_store_dwordx4 v36, v[16:19], s[34:35] offset:64 +; GFX942-VGPR-NEXT: global_store_dwordx4 v36, v[20:23], s[34:35] offset:80 +; GFX942-VGPR-NEXT: global_store_dwordx4 v36, v[8:11], s[34:35] offset:32 +; GFX942-VGPR-NEXT: global_store_dwordx4 v36, v[12:15], s[34:35] offset:48 +; GFX942-VGPR-NEXT: global_store_dwordx4 v36, v[0:3], s[34:35] +; GFX942-VGPR-NEXT: global_store_dwordx4 v36, v[4:7], s[34:35] offset:16 ; GFX942-VGPR-NEXT: s_endpgm bb: %in.1 = load <32 x float>, ptr addrspace(1) %arg @@ -294,10 +292,8 @@ define 
amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(ptr addrspace(1) %arg) # ; GFX942-LABEL: test_mfma_f32_16x16x4bf16_1k: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 -; GFX942-NEXT: v_mov_b32_e32 v1, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, 1 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: v_mov_b32_e32 v0, 2 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], 1 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], 2 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) @@ -318,13 +314,14 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(ptr addrspace(1) %arg) # ; GFX942-NEXT: v_accvgpr_write_b32 a14, s14 ; GFX942-NEXT: v_accvgpr_write_b32 a15, s15 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f32_16x16x4_4b_bf16 a[0:15], v[2:3], v[0:1], a[0:15] cbsz:1 abid:2 blgp:3 +; GFX942-NEXT: v_mfma_f32_16x16x4_4b_bf16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 2 -; GFX942-NEXT: global_store_dwordx4 v1, a[12:15], s[16:17] offset:48 -; GFX942-NEXT: global_store_dwordx4 v1, a[8:11], s[16:17] offset:32 -; GFX942-NEXT: global_store_dwordx4 v1, a[4:7], s[16:17] offset:16 -; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[16:17] +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 +; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 +; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 +; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] ; GFX942-NEXT: s_endpgm ; ; GFX90A-VGPR-LABEL: test_mfma_f32_16x16x4bf16_1k: @@ -358,10 +355,8 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(ptr addrspace(1) %arg) # ; GFX942-VGPR-LABEL: test_mfma_f32_16x16x4bf16_1k: ; GFX942-VGPR: ; %bb.0: ; %bb ; GFX942-VGPR-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, 0 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v18, 1 -; 
GFX942-VGPR-NEXT: v_mov_b32_e32 v19, v17 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 2 +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[16:17], 1 +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[18:19], 2 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) @@ -374,13 +369,14 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(ptr addrspace(1) %arg) # ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[12:13] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX942-VGPR-NEXT: s_nop 1 -; GFX942-VGPR-NEXT: v_mfma_f32_16x16x4_4b_bf16 v[0:15], v[18:19], v[16:17], v[0:15] cbsz:1 abid:2 blgp:3 +; GFX942-VGPR-NEXT: v_mfma_f32_16x16x4_4b_bf16 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-VGPR-NEXT: s_nop 7 -; GFX942-VGPR-NEXT: s_nop 2 -; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[12:15], s[16:17] offset:48 -; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[8:11], s[16:17] offset:32 -; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[4:7], s[16:17] offset:16 -; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[0:3], s[16:17] +; GFX942-VGPR-NEXT: s_nop 1 +; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 +; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 +; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] ; GFX942-VGPR-NEXT: s_endpgm bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg @@ -415,10 +411,9 @@ define amdgpu_kernel void @test_mfma_f32_4x4x4bf16_1k(ptr addrspace(1) %arg) #0 ; GFX942-LABEL: test_mfma_f32_4x4x4bf16_1k: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942-NEXT: v_mov_b32_e32 v1, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, 1 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: v_mov_b32_e32 v0, 2 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], 1 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], 
2 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) @@ -427,9 +422,9 @@ define amdgpu_kernel void @test_mfma_f32_4x4x4bf16_1k(ptr addrspace(1) %arg) #0 ; GFX942-NEXT: v_accvgpr_write_b32 a2, s2 ; GFX942-NEXT: v_accvgpr_write_b32 a3, s3 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f32_4x4x4_16b_bf16 a[0:3], v[2:3], v[0:1], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-NEXT: v_mfma_f32_4x4x4_16b_bf16 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 ; GFX942-NEXT: s_nop 4 -; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7] +; GFX942-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7] ; GFX942-NEXT: s_endpgm ; ; GFX90A-VGPR-LABEL: test_mfma_f32_4x4x4bf16_1k: @@ -453,19 +448,18 @@ define amdgpu_kernel void @test_mfma_f32_4x4x4bf16_1k(ptr addrspace(1) %arg) #0 ; GFX942-VGPR-LABEL: test_mfma_f32_4x4x4bf16_1k: ; GFX942-VGPR: ; %bb.0: ; %bb ; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 1 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 2 +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], 1 +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], 2 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-VGPR-NEXT: s_nop 1 -; GFX942-VGPR-NEXT: v_mfma_f32_4x4x4_16b_bf16 v[0:3], v[6:7], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-VGPR-NEXT: v_mfma_f32_4x4x4_16b_bf16 v[0:3], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3 ; GFX942-VGPR-NEXT: s_nop 4 -; GFX942-VGPR-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7] +; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[6:7] ; GFX942-VGPR-NEXT: s_endpgm bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg @@ 
-517,10 +511,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(ptr addrspace(1) %arg) # ; GFX942-LABEL: test_mfma_f32_32x32x8bf16_1k: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 -; GFX942-NEXT: v_mov_b32_e32 v1, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, 1 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: v_mov_b32_e32 v0, 2 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], 1 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], 2 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) @@ -541,13 +533,14 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(ptr addrspace(1) %arg) # ; GFX942-NEXT: v_accvgpr_write_b32 a14, s14 ; GFX942-NEXT: v_accvgpr_write_b32 a15, s15 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f32_32x32x8_bf16 a[0:15], v[2:3], v[0:1], a[0:15] cbsz:1 abid:2 blgp:3 +; GFX942-NEXT: v_mfma_f32_32x32x8_bf16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 2 -; GFX942-NEXT: global_store_dwordx4 v1, a[12:15], s[16:17] offset:48 -; GFX942-NEXT: global_store_dwordx4 v1, a[8:11], s[16:17] offset:32 -; GFX942-NEXT: global_store_dwordx4 v1, a[4:7], s[16:17] offset:16 -; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[16:17] +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 +; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 +; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 +; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] ; GFX942-NEXT: s_endpgm ; ; GFX90A-VGPR-LABEL: test_mfma_f32_32x32x8bf16_1k: @@ -582,10 +575,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(ptr addrspace(1) %arg) # ; GFX942-VGPR-LABEL: test_mfma_f32_32x32x8bf16_1k: ; GFX942-VGPR: ; %bb.0: ; %bb ; GFX942-VGPR-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, 0 -; GFX942-VGPR-NEXT: 
v_mov_b32_e32 v18, 1 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v19, v17 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 2 +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[16:17], 1 +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[18:19], 2 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) @@ -598,13 +589,14 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(ptr addrspace(1) %arg) # ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[12:13] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX942-VGPR-NEXT: s_nop 1 -; GFX942-VGPR-NEXT: v_mfma_f32_32x32x8_bf16 v[0:15], v[18:19], v[16:17], v[0:15] cbsz:1 abid:2 blgp:3 +; GFX942-VGPR-NEXT: v_mfma_f32_32x32x8_bf16 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-VGPR-NEXT: s_nop 7 -; GFX942-VGPR-NEXT: s_nop 2 -; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[12:15], s[16:17] offset:48 -; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[8:11], s[16:17] offset:32 -; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[4:7], s[16:17] offset:16 -; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[0:3], s[16:17] +; GFX942-VGPR-NEXT: s_nop 1 +; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 +; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 +; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] ; GFX942-VGPR-NEXT: s_endpgm bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg @@ -640,10 +632,9 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg) ; GFX942-LABEL: test_mfma_f32_16x16x16bf16_1k: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942-NEXT: v_mov_b32_e32 v1, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, 1 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: v_mov_b32_e32 v0, 2 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], 1 +; GFX942-NEXT: 
v_mov_b64_e32 v[2:3], 2 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) @@ -652,9 +643,9 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg) ; GFX942-NEXT: v_accvgpr_write_b32 a2, s2 ; GFX942-NEXT: v_accvgpr_write_b32 a3, s3 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f32_16x16x16_bf16 a[0:3], v[2:3], v[0:1], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-NEXT: v_mfma_f32_16x16x16_bf16 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 ; GFX942-NEXT: s_nop 6 -; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7] +; GFX942-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7] ; GFX942-NEXT: s_endpgm ; ; GFX90A-VGPR-LABEL: test_mfma_f32_16x16x16bf16_1k: @@ -679,19 +670,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg) ; GFX942-VGPR-LABEL: test_mfma_f32_16x16x16bf16_1k: ; GFX942-VGPR: ; %bb.0: ; %bb ; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 1 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v5 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 2 +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], 1 +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], 2 +; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-VGPR-NEXT: s_nop 1 -; GFX942-VGPR-NEXT: v_mfma_f32_16x16x16_bf16 v[0:3], v[6:7], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-VGPR-NEXT: v_mfma_f32_16x16x16_bf16 v[0:3], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3 ; GFX942-VGPR-NEXT: s_nop 6 -; GFX942-VGPR-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7] +; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[6:7] ; GFX942-VGPR-NEXT: s_endpgm bb: %in.1 = load <4 x float>, ptr 
addrspace(1) %arg diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll index 78be949baabac..9e88b771328ac 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll @@ -5626,70 +5626,39 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) # ; ; GFX942-VGPR-LABEL: test_mfma_f32_32x32x1f32_imm: ; GFX942-VGPR: ; %bb.0: ; %bb -; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 1.0 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 0 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v1 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, v1 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v1 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, v1 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, v1 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v10, v1 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v11, v1 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v12, v1 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v13, v1 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v14, v1 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v15, v1 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, v1 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, v1 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v18, v1 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v19, v1 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v20, v1 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v21, v1 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v22, v1 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v23, v1 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v24, v1 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v25, v1 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v26, v1 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v27, v1 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v28, v1 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v29, v1 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v30, v1 -; GFX942-VGPR-NEXT: v_mov_b32_e32 v31, v1 -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[32:33], v[30:31] -; GFX942-VGPR-NEXT: v_mov_b32_e32 v34, 2.0 -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[30:31], v[28:29] -; GFX942-VGPR-NEXT: 
v_mov_b64_e32 v[28:29], v[26:27] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[26:27], v[24:25] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[24:25], v[22:23] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[22:23], v[20:21] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[20:21], v[18:19] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[18:19], v[16:17] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[16:17], v[14:15] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[14:15], v[12:13] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], v[10:11] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], v[8:9] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], v[6:7] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], v[4:5] +; GFX942-VGPR-NEXT: v_mov_b32_e32 v33, 1.0 +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], 0 +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], 0x3f800000 ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], v[2:3] -; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], v[2:3] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], v[2:3] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], v[2:3] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], v[2:3] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[14:15], v[2:3] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[16:17], v[2:3] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[18:19], v[2:3] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[20:21], v[2:3] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[22:23], v[2:3] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[24:25], v[2:3] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[26:27], v[2:3] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[28:29], v[2:3] +; GFX942-VGPR-NEXT: v_mov_b64_e32 v[30:31], v[2:3] +; GFX942-VGPR-NEXT: v_mov_b32_e32 v34, 2.0 ; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX942-VGPR-NEXT: s_nop 0 -; GFX942-VGPR-NEXT: v_mfma_f32_32x32x1_2b_f32 v[2:33], v0, v34, v[2:33] +; GFX942-VGPR-NEXT: v_mov_b32_e32 v32, 0 +; GFX942-VGPR-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v33, v34, v[0:31] ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_nop 7 ; GFX942-VGPR-NEXT: s_nop 7 ; GFX942-VGPR-NEXT: s_nop 0 -; GFX942-VGPR-NEXT: 
global_store_dwordx4 v1, v[30:33], s[0:1] offset:112 -; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[26:29], s[0:1] offset:96 -; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[22:25], s[0:1] offset:80 -; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[18:21], s[0:1] offset:64 -; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[14:17], s[0:1] offset:48 -; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[10:13], s[0:1] offset:32 -; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[6:9], s[0:1] offset:16 -; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[2:5], s[0:1] +; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 +; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 +; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 +; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64 +; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 +; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 +; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 +; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] ; GFX942-VGPR-NEXT: s_endpgm bb: %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> , i32 0, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/masked-load-vectortypes.ll b/llvm/test/CodeGen/AMDGPU/masked-load-vectortypes.ll index 3b855a56a5abb..045757e5a8b71 100644 --- a/llvm/test/CodeGen/AMDGPU/masked-load-vectortypes.ll +++ b/llvm/test/CodeGen/AMDGPU/masked-load-vectortypes.ll @@ -7,12 +7,12 @@ define <2 x i32> @uniform_masked_load_ptr1_mask_v2i32(ptr addrspace(1) inreg noc ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], 0 ; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc ; 
GFX942-NEXT: s_cbranch_execz .LBB0_2 ; GFX942-NEXT: ; %bb.1: ; %cond.load -; GFX942-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] +; GFX942-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] ; GFX942-NEXT: .LBB0_2: ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -24,24 +24,46 @@ entry: ret <2 x i32> %result } -define <4 x i32> @uniform_masked_load_ptr1_mask_v4i32(ptr addrspace(1) inreg nocapture readonly %ptr, i1 %mask) { -; GFX942-LABEL: uniform_masked_load_ptr1_mask_v4i32: +define <3 x i32> @uniform_masked_load_ptr1_mask_v3i32(ptr addrspace(1) inreg nocapture readonly %ptr, i1 %mask) { +; GFX942-LABEL: uniform_masked_load_ptr1_mask_v3i32: ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], 0 ; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX942-NEXT: s_cbranch_execz .LBB1_2 ; GFX942-NEXT: ; %bb.1: ; %cond.load -; GFX942-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1] +; GFX942-NEXT: global_load_dwordx3 v[0:2], v2, s[0:1] ; GFX942-NEXT: .LBB1_2: ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] +entry: + %partialmaskvec = insertelement <3 x i1> poison, i1 %mask, i64 0 + %maskvec = shufflevector <3 x i1> %partialmaskvec, <3 x i1> poison, <3 x i32> zeroinitializer + %result = tail call <3 x i32> @llvm.masked.load.v3i32.p1(ptr addrspace(1) %ptr, i32 2, <3 x i1> %maskvec, <3 x i32> zeroinitializer) + ret <3 x i32> %result +} + +define <4 x i32> @uniform_masked_load_ptr1_mask_v4i32(ptr addrspace(1) inreg nocapture readonly %ptr, i1 %mask) { +; GFX942-LABEL: uniform_masked_load_ptr1_mask_v4i32: +; GFX942: ; %bb.0: ; %entry +; GFX942-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX942-NEXT: s_cbranch_execz .LBB2_2 +; GFX942-NEXT: ; %bb.1: ; %cond.load +; GFX942-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1] +; GFX942-NEXT: .LBB2_2: +; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] entry: %partialmaskvec = insertelement <4 x i1> poison, i1 %mask, i64 0 %maskvec = shufflevector <4 x i1> %partialmaskvec, <4 x i1> poison, <4 x i32> zeroinitializer @@ -49,21 +71,50 @@ entry: ret <4 x i32> %result } +define <5 x i32> @uniform_masked_load_ptr1_mask_v5i32(ptr addrspace(1) inreg nocapture readonly %ptr, i1 %mask) { +; GFX942-LABEL: uniform_masked_load_ptr1_mask_v5i32: +; GFX942: ; %bb.0: ; %entry +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX942-NEXT: s_cbranch_execz .LBB3_2 +; GFX942-NEXT: ; %bb.1: ; %cond.load +; GFX942-NEXT: global_load_dword v10, v4, s[0:1] offset:16 +; GFX942-NEXT: global_load_dwordx4 v[6:9], v4, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v7 +; GFX942-NEXT: v_mov_b32_e32 v2, v8 +; GFX942-NEXT: v_mov_b32_e32 v3, v9 +; GFX942-NEXT: v_mov_b32_e32 v4, v10 +; GFX942-NEXT: .LBB3_2: +; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-NEXT: s_setpc_b64 s[30:31] +entry: + %partialmaskvec = insertelement <5 x i1> poison, i1 %mask, i64 0 + %maskvec = shufflevector <5 x i1> %partialmaskvec, <5 x i1> poison, <5 x i32> zeroinitializer + %result = tail call <5 x i32> 
@llvm.masked.load.v5i32.p1(ptr addrspace(1) %ptr, i32 2, <5 x i1> %maskvec, <5 x i32> zeroinitializer) + ret <5 x i32> %result +} + define <4 x float> @uniform_masked_load_ptr1_mask_v4f32(ptr addrspace(1) inreg nocapture readonly %ptr, i1 %mask) { ; GFX942-LABEL: uniform_masked_load_ptr1_mask_v4f32: ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX942-NEXT: s_cbranch_execz .LBB2_2 +; GFX942-NEXT: s_cbranch_execz .LBB4_2 ; GFX942-NEXT: ; %bb.1: ; %cond.load -; GFX942-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1] -; GFX942-NEXT: .LBB2_2: +; GFX942-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1] +; GFX942-NEXT: .LBB4_2: ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -74,27 +125,49 @@ entry: ret <4 x float> %result } +define <6 x i32> @uniform_masked_load_ptr1_mask_v6i32(ptr addrspace(1) inreg nocapture readonly %ptr, i1 %mask) { +; GFX942-LABEL: uniform_masked_load_ptr1_mask_v6i32: +; GFX942: ; %bb.0: ; %entry +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1] +; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX942-NEXT: s_cbranch_execz .LBB5_2 +; GFX942-NEXT: ; %bb.1: ; %cond.load +; GFX942-NEXT: global_load_dwordx2 v[4:5], v6, s[0:1] offset:16 +; GFX942-NEXT: global_load_dwordx4 v[0:3], v6, s[0:1] +; GFX942-NEXT: .LBB5_2: 
+; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] +entry: + %partialmaskvec = insertelement <6 x i1> poison, i1 %mask, i64 0 + %maskvec = shufflevector <6 x i1> %partialmaskvec, <6 x i1> poison, <6 x i32> zeroinitializer + %result = tail call <6 x i32> @llvm.masked.load.v6i32.p1(ptr addrspace(1) %ptr, i32 2, <6 x i1> %maskvec, <6 x i32> zeroinitializer) + ret <6 x i32> %result +} + define <8 x i32> @uniform_masked_load_ptr1_mask_v8i32(ptr addrspace(1) inreg nocapture readonly %ptr, i1 %mask) { ; GFX942-LABEL: uniform_masked_load_ptr1_mask_v8i32: ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1] +; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX942-NEXT: s_cbranch_execz .LBB3_2 +; GFX942-NEXT: s_cbranch_execz .LBB6_2 ; GFX942-NEXT: ; %bb.1: ; %cond.load -; GFX942-NEXT: global_load_dwordx4 v[4:7], v0, s[0:1] offset:16 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1] -; GFX942-NEXT: .LBB3_2: +; GFX942-NEXT: global_load_dwordx4 v[4:7], v8, s[0:1] offset:16 +; GFX942-NEXT: global_load_dwordx4 v[0:3], v8, s[0:1] +; GFX942-NEXT: .LBB6_2: ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -111,21 +184,17 @@ define <8 x float> @uniform_masked_load_ptr1_mask_v8f32(ptr addrspace(1) inreg n ; GFX942-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v0 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v7, v0 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1] +; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX942-NEXT: s_cbranch_execz .LBB4_2 +; GFX942-NEXT: s_cbranch_execz .LBB7_2 ; GFX942-NEXT: ; %bb.1: ; %cond.load -; GFX942-NEXT: global_load_dwordx4 v[4:7], v0, s[0:1] offset:16 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1] -; GFX942-NEXT: .LBB4_2: +; GFX942-NEXT: global_load_dwordx4 v[4:7], v8, s[0:1] offset:16 +; GFX942-NEXT: global_load_dwordx4 v[0:3], v8, s[0:1] +; GFX942-NEXT: .LBB7_2: ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -142,15 +211,14 @@ define <8 x i16> @uniform_masked_load_ptr1_mask_v8i16(ptr addrspace(1) inreg noc ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX942-NEXT: s_cbranch_execz .LBB5_2 +; GFX942-NEXT: s_cbranch_execz .LBB8_2 ; GFX942-NEXT: ; %bb.1: ; %cond.load -; GFX942-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1] -; GFX942-NEXT: .LBB5_2: +; GFX942-NEXT: 
global_load_dwordx4 v[0:3], v4, s[0:1] +; GFX942-NEXT: .LBB8_2: ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -161,8 +229,8 @@ entry: ret <8 x i16> %result } -define <8 x half> @uniform_masked_load_ptr1_mask_v8f16(ptr addrspace(1) inreg nocapture readonly %ptr, i1 %mask) { -; GFX942-LABEL: uniform_masked_load_ptr1_mask_v8f16: +define <10 x i16> @uniform_masked_load_ptr1_mask_v10i16(ptr addrspace(1) inreg nocapture readonly %ptr, i1 %mask) { +; GFX942-LABEL: uniform_masked_load_ptr1_mask_v10i16: ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_and_b32_e32 v0, 1, v0 @@ -171,11 +239,38 @@ define <8 x half> @uniform_masked_load_ptr1_mask_v8f16(ptr addrspace(1) inreg no ; GFX942-NEXT: v_mov_b32_e32 v1, v0 ; GFX942-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, v0 ; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX942-NEXT: s_cbranch_execz .LBB6_2 +; GFX942-NEXT: s_cbranch_execz .LBB9_2 ; GFX942-NEXT: ; %bb.1: ; %cond.load +; GFX942-NEXT: global_load_dword v4, v0, s[0:1] offset:16 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1] -; GFX942-NEXT: .LBB6_2: +; GFX942-NEXT: .LBB9_2: +; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] +entry: + %partialmaskvec = insertelement <10 x i1> poison, i1 %mask, i16 0 + %maskvec = shufflevector <10 x i1> %partialmaskvec, <10 x i1> poison, <10 x i32> zeroinitializer + %result = tail call <10 x i16> @llvm.masked.load.v10i16.p1(ptr addrspace(1) %ptr, i32 4, <10 x i1> %maskvec, <10 x i16> zeroinitializer) + ret <10 x i16> %result +} + +define <8 x half> @uniform_masked_load_ptr1_mask_v8f16(ptr addrspace(1) inreg nocapture readonly %ptr, i1 %mask) { +; GFX942-LABEL: uniform_masked_load_ptr1_mask_v8f16: +; GFX942: ; %bb.0: ; %entry +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX942-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX942-NEXT: s_cbranch_execz .LBB10_2 +; GFX942-NEXT: ; %bb.1: ; %cond.load +; GFX942-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1] +; GFX942-NEXT: .LBB10_2: ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -192,15 +287,14 @@ define <8 x bfloat> @uniform_masked_load_ptr1_mask_v8bf16(ptr addrspace(1) inreg ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX942-NEXT: s_cbranch_execz .LBB7_2 +; GFX942-NEXT: s_cbranch_execz .LBB11_2 ; GFX942-NEXT: ; %bb.1: ; %cond.load -; GFX942-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1] -; GFX942-NEXT: .LBB7_2: +; GFX942-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1] +; GFX942-NEXT: .LBB11_2: ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] @@ -234,7 +328,7 @@ define <16 x i8> @uniform_masked_load_ptr1_mask_v16i8(ptr addrspace(1) inreg noc ; GFX942-NEXT: v_mov_b32_e32 v13, 0 ; GFX942-NEXT: v_mov_b32_e32 v14, 0 ; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX942-NEXT: s_cbranch_execz .LBB8_2 +; GFX942-NEXT: s_cbranch_execz .LBB12_2 ; GFX942-NEXT: ; %bb.1: ; %cond.load ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: global_load_dwordx4 v[16:19], v0, s[0:1] @@ -251,7 +345,7 @@ define <16 x i8> @uniform_masked_load_ptr1_mask_v16i8(ptr 
addrspace(1) inreg noc ; GFX942-NEXT: v_lshrrev_b32_e32 v3, 24, v16 ; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v16 ; GFX942-NEXT: v_lshrrev_b32_e32 v1, 8, v16 -; GFX942-NEXT: .LBB8_2: +; GFX942-NEXT: .LBB12_2: ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX942-NEXT: v_mov_b32_e32 v0, v16 ; GFX942-NEXT: v_mov_b32_e32 v4, v17 diff --git a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll index 6110b3101020a..d8454c900d23a 100644 --- a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll @@ -2498,39 +2498,40 @@ define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(ptr addrspace(1) %arg) ; ; GFX942-LABEL: test_mfma_nested_loop_zeroinit: ; GFX942: ; %bb.0: ; %entry -; GFX942-NEXT: v_accvgpr_write_b32 a0, 0 +; GFX942-NEXT: v_mov_b64_e32 v[0:1], 0 +; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a3, v1 +; GFX942-NEXT: v_accvgpr_write_b32 a5, v1 +; GFX942-NEXT: v_accvgpr_write_b32 a7, v1 +; GFX942-NEXT: v_accvgpr_write_b32 a9, v1 +; GFX942-NEXT: v_accvgpr_write_b32 a11, v1 +; GFX942-NEXT: v_accvgpr_write_b32 a13, v1 +; GFX942-NEXT: v_accvgpr_write_b32 a15, v1 +; GFX942-NEXT: v_accvgpr_write_b32 a17, v1 +; GFX942-NEXT: v_accvgpr_write_b32 a19, v1 +; GFX942-NEXT: v_accvgpr_write_b32 a21, v1 +; GFX942-NEXT: v_accvgpr_write_b32 a23, v1 +; GFX942-NEXT: v_accvgpr_write_b32 a25, v1 +; GFX942-NEXT: v_accvgpr_write_b32 a27, v1 +; GFX942-NEXT: v_accvgpr_write_b32 a29, v1 +; GFX942-NEXT: v_accvgpr_write_b32 a31, v1 ; GFX942-NEXT: s_mov_b32 s0, 0 -; GFX942-NEXT: v_accvgpr_mov_b32 a1, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a3, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a4, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a5, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a6, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a7, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a8, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a9, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a10, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a11, a0 -; 
GFX942-NEXT: v_accvgpr_mov_b32 a12, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a13, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a14, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a15, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a16, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a17, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a18, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a19, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a20, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a21, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a22, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a23, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a24, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a25, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a26, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a27, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a28, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a29, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a30, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a31, a0 +; GFX942-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX942-NEXT: v_accvgpr_write_b32 a2, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a4, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a6, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a8, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a10, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a12, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a14, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a16, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a18, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a20, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a22, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a24, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a26, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a28, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a30, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX942-NEXT: .LBB9_1: ; %for.cond.preheader diff --git a/llvm/test/CodeGen/AMDGPU/si-fold-operands-subreg-imm-gfx942.mir b/llvm/test/CodeGen/AMDGPU/si-fold-operands-subreg-imm-gfx942.mir new file mode 100644 index 0000000000000..a0721b06dc2cc --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/si-fold-operands-subreg-imm-gfx942.mir @@ -0,0 +1,99 @@ +# RUN: llc -mtriple=amdgcn-amd-amdhsa 
-mcpu=gfx942 -run-pass=si-fold-operands -o - %s | FileCheck %s + +--- +name: v_mov_b64_pseudo_ignore +tracksRegLiveness: true +body: | + bb.0: + + ; CHECK-LABEL: name: v_mov_b64_pseudo_ignore + ; CHECK: [[MAT_B64:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 1234, implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[MAT_B64]], %subreg.sub0, [[MAT_B64]], %subreg.sub1 + ; CHECK-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE]] + %0:vreg_64_align2 = V_MOV_B64_PSEUDO 1234, implicit $exec + %1:vreg_64_align2 = REG_SEQUENCE %0, %subreg.sub0, %0, %subreg.sub1 + S_ENDPGM 0, implicit %1 +... + +--- +name: v_mov_b32_splatted_zero +tracksRegLiveness: true +body: | + bb.0: + + ; CHECK-LABEL: name: v_mov_b32_splatted_zero + ; CHECK: {{%[0-9]}}:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[MAT_B64:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 0, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0, implicit [[MAT_B64]] + %0:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1:vreg_64_align2 = REG_SEQUENCE %0, %subreg.sub0, %0, %subreg.sub1 + S_ENDPGM 0, implicit %1 +... + +--- +name: multi_v_mov_b32_fits +tracksRegLiveness: true +body: | + bb.0: + + ; CHECK-LABEL: name: multi_v_mov_b32_fits + ; CHECK: [[MAT_B32:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 12345, implicit $exec + ; CHECK-NEXT: [[MAT_B32_ZERO:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[MAT_B64:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 12345, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0, implicit [[MAT_B64]] + %0:vgpr_32 = V_MOV_B32_e32 12345, implicit $exec + %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %2:vreg_64_align2 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1 + S_ENDPGM 0, implicit %2 +... 
+ +--- +name: multi_v_mov_b32_no_fold +tracksRegLiveness: true +body: | + bb.0: + + ; CHECK-LABEL: name: multi_v_mov_b32_no_fold + ; CHECK: [[MAT_B32_ONE:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec + ; CHECK-NEXT: [[MAT_B32:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 12345, implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[MAT_B32_ONE]], %subreg.sub0, [[MAT_B32]], %subreg.sub1 + ; CHECK-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE]] + %0:vgpr_32 = V_MOV_B32_e32 1, implicit $exec + %1:vgpr_32 = V_MOV_B32_e32 12345, implicit $exec + %2:vreg_64_align2 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1 + S_ENDPGM 0, implicit %2 +... + +--- +name: v_mov_b32_to_b128 +tracksRegLiveness: true +body: | + bb.0: + + ; CHECK-LABEL: name: v_mov_b32_to_b128 + ; CHECK: [[MAT_B32:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[MAT_B64_FIRST:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 0, implicit $exec + ; CHECK-NEXT: [[MAT_B64_SECOND:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 0, implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[MAT_B64_FIRST]], %subreg.sub0_sub1, [[MAT_B64_SECOND]], %subreg.sub2_sub3 + %0:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1:vreg_128_align2 = REG_SEQUENCE %0, %subreg.sub0, %0, %subreg.sub1, %0, %subreg.sub2, %0, %subreg.sub3 + S_ENDPGM 0, implicit %1 +... 
+ +--- +name: multi_v_mov_b32_to_b128 +tracksRegLiveness: true +body: | + bb.0: + + ; CHECK-LABEL: name: multi_v_mov_b32_to_b128 + ; CHECK: [[MAT_B32_FIRST:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 12345, implicit $exec + ; CHECK-NEXT: [[MAT_B32_SECOND:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[MAT_B64_FIRST:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 12345, implicit $exec + ; CHECK-NEXT: [[MAT_B64_SECOND:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 12345, implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[MAT_B64_FIRST]], %subreg.sub0_sub1, [[MAT_B64_SECOND]], %subreg.sub2_sub3 + %0:vgpr_32 = V_MOV_B32_e32 12345, implicit $exec + %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %2:vreg_128_align2 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %0, %subreg.sub2, %1, %subreg.sub3 + S_ENDPGM 0, implicit %2 +... diff --git a/llvm/test/CodeGen/AMDGPU/smfmac_no_agprs.ll b/llvm/test/CodeGen/AMDGPU/smfmac_no_agprs.ll index 1e042d3b4a31f..69773bf265e8c 100644 --- a/llvm/test/CodeGen/AMDGPU/smfmac_no_agprs.ll +++ b/llvm/test/CodeGen/AMDGPU/smfmac_no_agprs.ll @@ -6,25 +6,22 @@ define protected amdgpu_kernel void @test(ptr addrspace(1) %in, ptr addrspace(1) ; GFX942-LABEL: test: ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, v0 -; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: v_mov_b64_e32 v[4:5], 0 +; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[4:5] +; GFX942-NEXT: v_mov_b32_e32 v10, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 -; GFX942-NEXT: v_mov_b64_e32 v[10:11], v[2:3] -; GFX942-NEXT: v_mov_b64_e32 v[8:9], v[0:1] ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v12, s4 -; GFX942-NEXT: v_mov_b32_e32 v13, s5 -; GFX942-NEXT: v_mov_b32_e32 v4, s6 -; GFX942-NEXT: v_mov_b32_e32 v5, s7 -; GFX942-NEXT: v_mov_b32_e32 v6, s7 -; 
GFX942-NEXT: v_mov_b32_e32 v7, s7 +; GFX942-NEXT: v_mov_b32_e32 v8, s4 +; GFX942-NEXT: v_mov_b32_e32 v9, s5 +; GFX942-NEXT: v_mov_b32_e32 v0, s6 +; GFX942-NEXT: v_mov_b32_e32 v1, s7 +; GFX942-NEXT: v_mov_b32_e32 v2, s7 +; GFX942-NEXT: v_mov_b32_e32 v3, s7 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_smfmac_i32_16x16x64_i8 v[8:11], v[12:13], v[4:7], v13 +; GFX942-NEXT: v_smfmac_i32_16x16x64_i8 v[4:7], v[8:9], v[0:3], v9 ; GFX942-NEXT: s_nop 6 -; GFX942-NEXT: global_store_dword v0, v11, s[2:3] offset:12 +; GFX942-NEXT: global_store_dword v10, v7, s[2:3] offset:12 ; GFX942-NEXT: s_endpgm entry: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 0 diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll index d8264b5a091e1..776db1815a872 100644 --- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll +++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll @@ -447,37 +447,36 @@ define amdgpu_kernel void @v8i8_phi_zeroinit(ptr addrspace(1) %src1, ptr addrspa ; GFX942-LABEL: v8i8_phi_zeroinit: ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX942-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX942-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v0 -; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; GFX942-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v2 +; GFX942-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v2 +; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v2 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[8:9] +; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[8:9] ; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX942-NEXT: s_cbranch_execz .LBB9_2 ; GFX942-NEXT: ; %bb.1: ; %bb.1 -; GFX942-NEXT: global_load_dwordx2 v[4:5], v1, s[10:11] -; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0 -; GFX942-NEXT: s_waitcnt vmcnt(1) -; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: 
global_load_dwordx2 v[4:5], v3, s[10:11] +; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v2 ; GFX942-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX942-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX942-NEXT: v_mov_b32_e32 v3, v2 +; GFX942-NEXT: s_waitcnt vmcnt(1) +; GFX942-NEXT: v_mov_b64_e32 v[0:1], 0 ; GFX942-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] ; GFX942-NEXT: .LBB9_2: ; %Flow ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] ; GFX942-NEXT: s_cbranch_execz .LBB9_4 ; GFX942-NEXT: ; %bb.3: ; %bb.2 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3] -; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[12:13] +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1] +; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[12:13] ; GFX942-NEXT: .LBB9_4: ; %bb.3 ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: global_store_dwordx2 v0, v[4:5], s[14:15] ; GFX942-NEXT: s_endpgm entry: