Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -627,13 +627,21 @@ bool SIFixSGPRCopies::run(MachineFunction &MF) {
TRI = ST.getRegisterInfo();
TII = ST.getInstrInfo();

// Instructions to re-legalize after changing register classes
SmallVector<MachineInstr *, 8> Relegalize;

for (MachineBasicBlock &MBB : MF) {
for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
++I) {
MachineInstr &MI = *I;

switch (MI.getOpcode()) {
default:
// scale_src has a register class restricted to the low 256 VGPRs;
// changing registers to VGPR may not take that into account.
if (TII->isWMMA(MI) &&
AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::scale_src0))
Relegalize.push_back(&MI);
continue;
case AMDGPU::COPY: {
const TargetRegisterClass *SrcRC, *DstRC;
Expand Down Expand Up @@ -791,6 +799,9 @@ bool SIFixSGPRCopies::run(MachineFunction &MF) {
for (auto *MI : PHINodes) {
processPHINode(*MI);
}
while (!Relegalize.empty())
TII->legalizeOperands(*Relegalize.pop_back_val(), MDT);

if (MF.getTarget().getOptLevel() > CodeGenOptLevel::None && EnableM0Merge)
hoistAndMergeSGPRInits(AMDGPU::M0, *MRI, TRI, *MDT, TII);

Expand Down
15 changes: 15 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6574,6 +6574,21 @@ void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
!RI.isVGPR(MRI, MI.getOperand(VOP3Idx[2]).getReg()))
legalizeOpWithMove(MI, VOP3Idx[2]);

if (isWMMA(MI)) {
// scale_src has a register class restricted to the low 256 VGPRs, so we
// may need to insert a copy to the restricted VGPR class.
int ScaleSrc0Idx =
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::scale_src0);
if (ScaleSrc0Idx != -1) {
int ScaleSrc1Idx =
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::scale_src1);
if (!isOperandLegal(MI, ScaleSrc0Idx))
legalizeOpWithMove(MI, ScaleSrc0Idx);
if (!isOperandLegal(MI, ScaleSrc1Idx))
legalizeOpWithMove(MI, ScaleSrc1Idx);
}
}

// Fix the register class of packed FP32 instructions on gfx12+. See
// SIInstrInfo::isLegalGFX12PlusPackedMathFP32Operand for more information.
if (AMDGPU::isPackedFP32Inst(Opc) && AMDGPU::isGFX12Plus(ST)) {
Expand Down
4 changes: 2 additions & 2 deletions llvm/lib/Target/AMDGPU/VOP3PInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -1518,8 +1518,8 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
dag MatrixFMT = !if(HasMatrixFMT, (ins MatrixAFMT:$matrix_a_fmt, MatrixBFMT:$matrix_b_fmt),
(ins));
dag MatrixScaleSrc = !if(HasMatrixScale,
!if(Scale16, (ins VCSrc_b64:$scale_src0, VCSrc_b64:$scale_src1),
(ins VCSrc_b32:$scale_src0, VCSrc_b32:$scale_src1)),
!if(Scale16, (ins VCSrc_b64_Lo256:$scale_src0, VCSrc_b64_Lo256:$scale_src1),
(ins VCSrc_b32_Lo256:$scale_src0, VCSrc_b32_Lo256:$scale_src1)),
(ins));
dag MatrixScale = !if(HasMatrixScale, (ins MatrixAScale:$matrix_a_scale, MatrixBScale:$matrix_b_scale,
MatrixAScaleFmt:$matrix_a_scale_fmt, MatrixBScaleFmt:$matrix_b_scale_fmt),
Expand Down
24 changes: 12 additions & 12 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll
Original file line number Diff line number Diff line change
Expand Up @@ -901,9 +901,9 @@ bb:
define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_si_scale(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, i32 inreg %scale_src0, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_si_scale:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: s_movk_i32 s1, 0x64
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], s0, s1 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse
; GFX1250-NEXT: v_mov_b32_e32 v42, 0x64
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], s0, v42 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16
; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off
Expand Down Expand Up @@ -1499,9 +1499,9 @@ bb:
define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_si_scale(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 inreg %scale_src0, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_si_scale:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: s_mov_b64 s[2:3], 0x64
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], s[0:1], s[2:3] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse
; GFX1250-NEXT: v_mov_b64_e32 v[42:43], 0x64
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], s[0:1], v[42:43] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16
; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off
Expand Down Expand Up @@ -2291,9 +2291,9 @@ bb:
define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_si_scale(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, i32 inreg %scale_src0, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_scale_f32_32x16x128_f4_si_scale:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: s_movk_i32 s1, 0x64
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-NEXT: v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], s0, s1 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse
; GFX1250-NEXT: v_mov_b32_e32 v42, 0x64
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], s0, v42 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse
; GFX1250-NEXT: s_clause 0x3
; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48
; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32
Expand Down Expand Up @@ -2373,9 +2373,9 @@ bb:
define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_si_scale(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, i64 inreg %scale_src0, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_scale16_f32_32x16x128_f4_si_scale:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: s_mov_b64 s[2:3], 0x64
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], s[0:1], s[2:3] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse
; GFX1250-NEXT: v_mov_b64_e32 v[42:43], 0x64
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], s[0:1], v[42:43] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse
; GFX1250-NEXT: s_clause 0x3
; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48
; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32
Expand Down
34 changes: 16 additions & 18 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1250.w32.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1512,14 +1512,13 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_non_inlineable(<16 x
; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_non_inlineable:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_mov_b32_e32 v34, 0x40400000
; GFX1250-NEXT: s_movk_i32 s0, 0x65
; GFX1250-NEXT: s_movk_i32 s1, 0x64
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_mov_b32_e32 v43, 0x64
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v42, 0x65 :: v_dual_mov_b32 v41, v34
; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34
; GFX1250-NEXT: v_dual_mov_b32 v37, v34 :: v_dual_mov_b32 v38, v34
; GFX1250-NEXT: v_dual_mov_b32 v39, v34 :: v_dual_mov_b32 v40, v34
; GFX1250-NEXT: v_mov_b32_e32 v41, v34
; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41], s1, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41], v43, v42 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off
Expand Down Expand Up @@ -1619,14 +1618,14 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_non_inlineable(<16
; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_non_inlineable:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_mov_b32_e32 v34, 0x40400000
; GFX1250-NEXT: s_mov_b64 s[0:1], 0x65
; GFX1250-NEXT: s_mov_b64 s[2:3], 0x64
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_mov_b64_e32 v[42:43], 0x65
; GFX1250-NEXT: v_mov_b64_e32 v[44:45], 0x64
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34
; GFX1250-NEXT: v_dual_mov_b32 v37, v34 :: v_dual_mov_b32 v38, v34
; GFX1250-NEXT: v_dual_mov_b32 v39, v34 :: v_dual_mov_b32 v40, v34
; GFX1250-NEXT: v_mov_b32_e32 v41, v34
; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41], s[2:3], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41], v[44:45], v[42:43] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off
Expand Down Expand Up @@ -2621,19 +2620,18 @@ define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_non_inlineable(<16 x i32
; GFX1250-LABEL: test_wmma_scale_f32_32x16x128_f4_non_inlineable:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_mov_b32_e32 v26, 0x40400000
; GFX1250-NEXT: s_movk_i32 s0, 0x65
; GFX1250-NEXT: s_movk_i32 s1, 0x64
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_mov_b32_e32 v43, 0x64
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-NEXT: v_dual_mov_b32 v42, 0x65 :: v_dual_mov_b32 v41, v26
; GFX1250-NEXT: v_dual_mov_b32 v27, v26 :: v_dual_mov_b32 v28, v26
; GFX1250-NEXT: v_dual_mov_b32 v29, v26 :: v_dual_mov_b32 v30, v26
; GFX1250-NEXT: v_dual_mov_b32 v31, v26 :: v_dual_mov_b32 v32, v26
; GFX1250-NEXT: v_dual_mov_b32 v33, v26 :: v_dual_mov_b32 v34, v26
; GFX1250-NEXT: v_dual_mov_b32 v35, v26 :: v_dual_mov_b32 v36, v26
; GFX1250-NEXT: v_dual_mov_b32 v37, v26 :: v_dual_mov_b32 v38, v26
; GFX1250-NEXT: v_dual_mov_b32 v39, v26 :: v_dual_mov_b32 v40, v26
; GFX1250-NEXT: v_mov_b32_e32 v41, v26
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_wmma_scale_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41], s1, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
; GFX1250-NEXT: v_wmma_scale_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41], v43, v42 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
; GFX1250-NEXT: s_clause 0x3
; GFX1250-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48
; GFX1250-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32
Expand Down Expand Up @@ -2774,9 +2772,9 @@ define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_non_inlineable(<16 x i
; GFX1250-LABEL: test_wmma_scale16_f32_32x16x128_f4_non_inlineable:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_mov_b32_e32 v26, 0x40400000
; GFX1250-NEXT: s_mov_b64 s[0:1], 0x65
; GFX1250-NEXT: s_mov_b64 s[2:3], 0x64
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_mov_b64_e32 v[42:43], 0x65
; GFX1250-NEXT: v_mov_b64_e32 v[44:45], 0x64
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1250-NEXT: v_dual_mov_b32 v27, v26 :: v_dual_mov_b32 v28, v26
; GFX1250-NEXT: v_dual_mov_b32 v29, v26 :: v_dual_mov_b32 v30, v26
; GFX1250-NEXT: v_dual_mov_b32 v31, v26 :: v_dual_mov_b32 v32, v26
Expand All @@ -2786,7 +2784,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_non_inlineable(<16 x i
; GFX1250-NEXT: v_dual_mov_b32 v39, v26 :: v_dual_mov_b32 v40, v26
; GFX1250-NEXT: v_mov_b32_e32 v41, v26
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41], s[2:3], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
; GFX1250-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41], v[44:45], v[42:43] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
; GFX1250-NEXT: s_clause 0x3
; GFX1250-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48
; GFX1250-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32
Expand Down