diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index dce4e6f993005..6533d4c8eca35 100644
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -627,6 +627,9 @@ bool SIFixSGPRCopies::run(MachineFunction &MF) {
   TRI = ST.getRegisterInfo();
   TII = ST.getInstrInfo();
 
+  // Instructions to re-legalize after changing register classes.
+  SmallVector<MachineInstr *> Relegalize;
+
   for (MachineBasicBlock &MBB : MF) {
     for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
          ++I) {
@@ -634,6 +637,11 @@ bool SIFixSGPRCopies::run(MachineFunction &MF) {
 
       switch (MI.getOpcode()) {
      default:
+        // scale_src has a register class restricted to the low 256 VGPRs;
+        // changing registers to VGPR may not take this into account.
+        if (TII->isWMMA(MI) &&
+            AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::scale_src0))
+          Relegalize.push_back(&MI);
         continue;
       case AMDGPU::COPY: {
         const TargetRegisterClass *SrcRC, *DstRC;
@@ -791,6 +799,9 @@ bool SIFixSGPRCopies::run(MachineFunction &MF) {
   for (auto *MI : PHINodes) {
     processPHINode(*MI);
   }
 
+  while (!Relegalize.empty())
+    TII->legalizeOperands(*Relegalize.pop_back_val(), MDT);
+
   if (MF.getTarget().getOptLevel() > CodeGenOptLevel::None && EnableM0Merge)
     hoistAndMergeSGPRInits(AMDGPU::M0, *MRI, TRI, *MDT, TII);
 
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index d500858841a41..5c958dfe6954f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -6574,6 +6574,21 @@ void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
       !RI.isVGPR(MRI, MI.getOperand(VOP3Idx[2]).getReg()))
     legalizeOpWithMove(MI, VOP3Idx[2]);
 
+  if (isWMMA(MI)) {
+    // scale_src has a register class restricted to the low 256 VGPRs; we may
+    // need to insert a copy to the restricted VGPR class.
+    int ScaleSrc0Idx =
+        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::scale_src0);
+    if (ScaleSrc0Idx != -1) {
+      int ScaleSrc1Idx =
+          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::scale_src1);
+      if (!isOperandLegal(MI, ScaleSrc0Idx))
+        legalizeOpWithMove(MI, ScaleSrc0Idx);
+      if (!isOperandLegal(MI, ScaleSrc1Idx))
+        legalizeOpWithMove(MI, ScaleSrc1Idx);
+    }
+  }
+
   // Fix the register class of packed FP32 instructions on gfx12+. See
   // SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more information.
   if (AMDGPU::isPackedFP32Inst(Opc) && AMDGPU::isGFX12Plus(ST)) {
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index cf0e502b2dab4..cd17382d8d308 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -1518,8 +1518,8 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
   dag MatrixFMT = !if(HasMatrixFMT, (ins MatrixAFMT:$matrix_a_fmt, MatrixBFMT:$matrix_b_fmt),
                                     (ins));
   dag MatrixScaleSrc = !if(HasMatrixScale,
-                           !if(Scale16, (ins VCSrc_b64:$scale_src0, VCSrc_b64:$scale_src1),
-                                        (ins VCSrc_b32:$scale_src0, VCSrc_b32:$scale_src1)),
+                           !if(Scale16, (ins VCSrc_b64_Lo256:$scale_src0, VCSrc_b64_Lo256:$scale_src1),
+                                        (ins VCSrc_b32_Lo256:$scale_src0, VCSrc_b32_Lo256:$scale_src1)),
                            (ins));
   dag MatrixScale = !if(HasMatrixScale, (ins MatrixAScale:$matrix_a_scale, MatrixBScale:$matrix_b_scale, MatrixAScaleFmt:$matrix_a_scale_fmt, MatrixBScaleFmt:$matrix_b_scale_fmt),
                                         (ins));
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll
index 1bf865c414279..b6c930dacf9ab 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll
@@ -901,9 +901,9 @@ bb:
 define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_si_scale(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, i32 inreg %scale_src0, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_si_scale:
 ; GFX1250:       ; %bb.0: ; %bb
-; GFX1250-NEXT:    s_movk_i32 s1, 0x64
-; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1250-NEXT:    v_wmma_scale_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], s0, s1 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse
+; GFX1250-NEXT:    v_mov_b32_e32 v42, 0x64
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_wmma_scale_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], s0, v42 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[40:41], v[36:39], off offset:16
 ; GFX1250-NEXT:    global_store_b128 v[40:41], v[32:35], off
@@ -1499,9 +1499,9 @@ bb:
 define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_si_scale(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 inreg %scale_src0, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_si_scale:
 ; GFX1250:       ; %bb.0: ; %bb
-; GFX1250-NEXT:    s_mov_b64 s[2:3], 0x64
-; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1250-NEXT:    v_wmma_scale16_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], s[0:1], s[2:3] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse
+; GFX1250-NEXT:    v_mov_b64_e32 v[42:43], 0x64
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_wmma_scale16_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], s[0:1], v[42:43] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[40:41], v[36:39], off offset:16
 ; GFX1250-NEXT:    global_store_b128 v[40:41], v[32:35], off
@@ -2291,9 +2291,9 @@ bb:
 define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_si_scale(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, i32 inreg %scale_src0, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale_f32_32x16x128_f4_si_scale:
 ; GFX1250:       ; %bb.0: ; %bb
-; GFX1250-NEXT:    s_movk_i32 s1, 0x64
-; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1250-NEXT:    v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], s0, s1 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse
+; GFX1250-NEXT:    v_mov_b32_e32 v42, 0x64
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], s0, v42 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse
 ; GFX1250-NEXT:    s_clause 0x3
 ; GFX1250-NEXT:    global_store_b128 v[40:41], v[36:39], off offset:48
 ; GFX1250-NEXT:    global_store_b128 v[40:41], v[32:35], off offset:32
@@ -2373,9 +2373,9 @@ bb:
 define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_si_scale(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, i64 inreg %scale_src0, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale16_f32_32x16x128_f4_si_scale:
 ; GFX1250:       ; %bb.0: ; %bb
-; GFX1250-NEXT:    s_mov_b64 s[2:3], 0x64
-; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1250-NEXT:    v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], s[0:1], s[2:3] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse
+; GFX1250-NEXT:    v_mov_b64_e32 v[42:43], 0x64
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], s[0:1], v[42:43] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse
 ; GFX1250-NEXT:    s_clause 0x3
 ; GFX1250-NEXT:    global_store_b128 v[40:41], v[36:39], off offset:48
 ; GFX1250-NEXT:    global_store_b128 v[40:41], v[32:35], off offset:32
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1250.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1250.w32.ll
index 48303c004f1d0..dc477992db81e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1250.w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1250.w32.ll
@@ -1512,14 +1512,13 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_non_inlineable(<16 x
 ; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_non_inlineable:
 ; GFX1250:       ; %bb.0: ; %bb
 ; GFX1250-NEXT:    v_mov_b32_e32 v34, 0x40400000
-; GFX1250-NEXT:    s_movk_i32 s0, 0x65
-; GFX1250-NEXT:    s_movk_i32 s1, 0x64
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_mov_b32_e32 v43, 0x64
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_dual_mov_b32 v42, 0x65 :: v_dual_mov_b32 v41, v34
 ; GFX1250-NEXT:    v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34
 ; GFX1250-NEXT:    v_dual_mov_b32 v37, v34 :: v_dual_mov_b32 v38, v34
 ; GFX1250-NEXT:    v_dual_mov_b32 v39, v34 :: v_dual_mov_b32 v40, v34
-; GFX1250-NEXT:    v_mov_b32_e32 v41, v34
-; GFX1250-NEXT:    v_wmma_scale_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41], s1, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
+; GFX1250-NEXT:    v_wmma_scale_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41], v43, v42 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[32:33], v[38:41], off offset:16
 ; GFX1250-NEXT:    global_store_b128 v[32:33], v[34:37], off
@@ -1619,14 +1618,14 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_non_inlineable(<16
 ; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_non_inlineable:
 ; GFX1250:       ; %bb.0: ; %bb
 ; GFX1250-NEXT:    v_mov_b32_e32 v34, 0x40400000
-; GFX1250-NEXT:    s_mov_b64 s[0:1], 0x65
-; GFX1250-NEXT:    s_mov_b64 s[2:3], 0x64
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_mov_b64_e32 v[42:43], 0x65
+; GFX1250-NEXT:    v_mov_b64_e32 v[44:45], 0x64
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34
 ; GFX1250-NEXT:    v_dual_mov_b32 v37, v34 :: v_dual_mov_b32 v38, v34
 ; GFX1250-NEXT:    v_dual_mov_b32 v39, v34 :: v_dual_mov_b32 v40, v34
 ; GFX1250-NEXT:    v_mov_b32_e32 v41, v34
-; GFX1250-NEXT:    v_wmma_scale16_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41], s[2:3], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
+; GFX1250-NEXT:    v_wmma_scale16_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41], v[44:45], v[42:43] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[32:33], v[38:41], off offset:16
 ; GFX1250-NEXT:    global_store_b128 v[32:33], v[34:37], off
@@ -2621,9 +2620,9 @@ define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_non_inlineable(<16 x i32
 ; GFX1250-LABEL: test_wmma_scale_f32_32x16x128_f4_non_inlineable:
 ; GFX1250:       ; %bb.0: ; %bb
 ; GFX1250-NEXT:    v_mov_b32_e32 v26, 0x40400000
-; GFX1250-NEXT:    s_movk_i32 s0, 0x65
-; GFX1250-NEXT:    s_movk_i32 s1, 0x64
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_mov_b32_e32 v43, 0x64
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-NEXT:    v_dual_mov_b32 v42, 0x65 :: v_dual_mov_b32 v41, v26
 ; GFX1250-NEXT:    v_dual_mov_b32 v27, v26 :: v_dual_mov_b32 v28, v26
 ; GFX1250-NEXT:    v_dual_mov_b32 v29, v26 :: v_dual_mov_b32 v30, v26
 ; GFX1250-NEXT:    v_dual_mov_b32 v31, v26 :: v_dual_mov_b32 v32, v26
@@ -2631,9 +2630,8 @@ define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_non_inlineable(<16 x i32
 ; GFX1250-NEXT:    v_dual_mov_b32 v35, v26 :: v_dual_mov_b32 v36, v26
 ; GFX1250-NEXT:    v_dual_mov_b32 v37, v26 :: v_dual_mov_b32 v38, v26
 ; GFX1250-NEXT:    v_dual_mov_b32 v39, v26 :: v_dual_mov_b32 v40, v26
-; GFX1250-NEXT:    v_mov_b32_e32 v41, v26
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-NEXT:    v_wmma_scale_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41], s1, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
+; GFX1250-NEXT:    v_wmma_scale_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41], v43, v42 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
 ; GFX1250-NEXT:    s_clause 0x3
 ; GFX1250-NEXT:    global_store_b128 v[24:25], v[38:41], off offset:48
 ; GFX1250-NEXT:    global_store_b128 v[24:25], v[34:37], off offset:32
@@ -2774,9 +2772,9 @@ define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_non_inlineable(<16 x i
 ; GFX1250-LABEL: test_wmma_scale16_f32_32x16x128_f4_non_inlineable:
 ; GFX1250:       ; %bb.0: ; %bb
 ; GFX1250-NEXT:    v_mov_b32_e32 v26, 0x40400000
-; GFX1250-NEXT:    s_mov_b64 s[0:1], 0x65
-; GFX1250-NEXT:    s_mov_b64 s[2:3], 0x64
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_mov_b64_e32 v[42:43], 0x65
+; GFX1250-NEXT:    v_mov_b64_e32 v[44:45], 0x64
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX1250-NEXT:    v_dual_mov_b32 v27, v26 :: v_dual_mov_b32 v28, v26
 ; GFX1250-NEXT:    v_dual_mov_b32 v29, v26 :: v_dual_mov_b32 v30, v26
 ; GFX1250-NEXT:    v_dual_mov_b32 v31, v26 :: v_dual_mov_b32 v32, v26
@@ -2786,7 +2784,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_non_inlineable(<16 x i
 ; GFX1250-NEXT:    v_dual_mov_b32 v39, v26 :: v_dual_mov_b32 v40, v26
 ; GFX1250-NEXT:    v_mov_b32_e32 v41, v26
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-NEXT:    v_wmma_scale16_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41], s[2:3], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
+; GFX1250-NEXT:    v_wmma_scale16_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41], v[44:45], v[42:43] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
 ; GFX1250-NEXT:    s_clause 0x3
 ; GFX1250-NEXT:    global_store_b128 v[24:25], v[38:41], off offset:48
 ; GFX1250-NEXT:    global_store_b128 v[24:25], v[34:37], off offset:32