-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[AMDGPU] Restrict scale operands of WMMA to low 256 VGPRs #157526
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AMDGPU] Restrict scale operands of WMMA to low 256 VGPRs #157526
Conversation
@llvm/pr-subscribers-backend-amdgpu Author: Stanislav Mekhanoshin (rampitec) ChangesThese cannot accept high registers. Full diff: https://github.com/llvm/llvm-project/pull/157526.diff 3 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index cf0e502b2dab4..cd17382d8d308 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -1518,8 +1518,8 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
dag MatrixFMT = !if(HasMatrixFMT, (ins MatrixAFMT:$matrix_a_fmt, MatrixBFMT:$matrix_b_fmt),
(ins));
dag MatrixScaleSrc = !if(HasMatrixScale,
- !if(Scale16, (ins VCSrc_b64:$scale_src0, VCSrc_b64:$scale_src1),
- (ins VCSrc_b32:$scale_src0, VCSrc_b32:$scale_src1)),
+ !if(Scale16, (ins VCSrc_b64_Lo256:$scale_src0, VCSrc_b64_Lo256:$scale_src1),
+ (ins VCSrc_b32_Lo256:$scale_src0, VCSrc_b32_Lo256:$scale_src1)),
(ins));
dag MatrixScale = !if(HasMatrixScale, (ins MatrixAScale:$matrix_a_scale, MatrixBScale:$matrix_b_scale,
MatrixAScaleFmt:$matrix_a_scale_fmt, MatrixBScaleFmt:$matrix_b_scale_fmt),
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll
index 1bf865c414279..b6c930dacf9ab 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll
@@ -901,9 +901,9 @@ bb:
define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_si_scale(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, i32 inreg %scale_src0, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_si_scale:
; GFX1250: ; %bb.0: ; %bb
-; GFX1250-NEXT: s_movk_i32 s1, 0x64
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], s0, s1 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse
+; GFX1250-NEXT: v_mov_b32_e32 v42, 0x64
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], s0, v42 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16
; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off
@@ -1499,9 +1499,9 @@ bb:
define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_si_scale(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 inreg %scale_src0, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_si_scale:
; GFX1250: ; %bb.0: ; %bb
-; GFX1250-NEXT: s_mov_b64 s[2:3], 0x64
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], s[0:1], s[2:3] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse
+; GFX1250-NEXT: v_mov_b64_e32 v[42:43], 0x64
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], s[0:1], v[42:43] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16
; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off
@@ -2291,9 +2291,9 @@ bb:
define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_si_scale(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, i32 inreg %scale_src0, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_scale_f32_32x16x128_f4_si_scale:
; GFX1250: ; %bb.0: ; %bb
-; GFX1250-NEXT: s_movk_i32 s1, 0x64
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1250-NEXT: v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], s0, s1 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse
+; GFX1250-NEXT: v_mov_b32_e32 v42, 0x64
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], s0, v42 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse
; GFX1250-NEXT: s_clause 0x3
; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48
; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32
@@ -2373,9 +2373,9 @@ bb:
define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_si_scale(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, i64 inreg %scale_src0, ptr addrspace(1) %out) {
; GFX1250-LABEL: test_wmma_scale16_f32_32x16x128_f4_si_scale:
; GFX1250: ; %bb.0: ; %bb
-; GFX1250-NEXT: s_mov_b64 s[2:3], 0x64
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1250-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], s[0:1], s[2:3] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse
+; GFX1250-NEXT: v_mov_b64_e32 v[42:43], 0x64
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], s[0:1], v[42:43] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse
; GFX1250-NEXT: s_clause 0x3
; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48
; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1250.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1250.w32.ll
index 48303c004f1d0..dc477992db81e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1250.w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1250.w32.ll
@@ -1512,14 +1512,13 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_non_inlineable(<16 x
; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_non_inlineable:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_mov_b32_e32 v34, 0x40400000
-; GFX1250-NEXT: s_movk_i32 s0, 0x65
-; GFX1250-NEXT: s_movk_i32 s1, 0x64
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_mov_b32_e32 v43, 0x64
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_dual_mov_b32 v42, 0x65 :: v_dual_mov_b32 v41, v34
; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34
; GFX1250-NEXT: v_dual_mov_b32 v37, v34 :: v_dual_mov_b32 v38, v34
; GFX1250-NEXT: v_dual_mov_b32 v39, v34 :: v_dual_mov_b32 v40, v34
-; GFX1250-NEXT: v_mov_b32_e32 v41, v34
-; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41], s1, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
+; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41], v43, v42 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off
@@ -1619,14 +1618,14 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_non_inlineable(<16
; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_non_inlineable:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_mov_b32_e32 v34, 0x40400000
-; GFX1250-NEXT: s_mov_b64 s[0:1], 0x65
-; GFX1250-NEXT: s_mov_b64 s[2:3], 0x64
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_mov_b64_e32 v[42:43], 0x65
+; GFX1250-NEXT: v_mov_b64_e32 v[44:45], 0x64
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34
; GFX1250-NEXT: v_dual_mov_b32 v37, v34 :: v_dual_mov_b32 v38, v34
; GFX1250-NEXT: v_dual_mov_b32 v39, v34 :: v_dual_mov_b32 v40, v34
; GFX1250-NEXT: v_mov_b32_e32 v41, v34
-; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41], s[2:3], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
+; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41], v[44:45], v[42:43] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off
@@ -2621,9 +2620,9 @@ define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_non_inlineable(<16 x i32
; GFX1250-LABEL: test_wmma_scale_f32_32x16x128_f4_non_inlineable:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_mov_b32_e32 v26, 0x40400000
-; GFX1250-NEXT: s_movk_i32 s0, 0x65
-; GFX1250-NEXT: s_movk_i32 s1, 0x64
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_mov_b32_e32 v43, 0x64
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-NEXT: v_dual_mov_b32 v42, 0x65 :: v_dual_mov_b32 v41, v26
; GFX1250-NEXT: v_dual_mov_b32 v27, v26 :: v_dual_mov_b32 v28, v26
; GFX1250-NEXT: v_dual_mov_b32 v29, v26 :: v_dual_mov_b32 v30, v26
; GFX1250-NEXT: v_dual_mov_b32 v31, v26 :: v_dual_mov_b32 v32, v26
@@ -2631,9 +2630,8 @@ define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_non_inlineable(<16 x i32
; GFX1250-NEXT: v_dual_mov_b32 v35, v26 :: v_dual_mov_b32 v36, v26
; GFX1250-NEXT: v_dual_mov_b32 v37, v26 :: v_dual_mov_b32 v38, v26
; GFX1250-NEXT: v_dual_mov_b32 v39, v26 :: v_dual_mov_b32 v40, v26
-; GFX1250-NEXT: v_mov_b32_e32 v41, v26
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-NEXT: v_wmma_scale_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41], s1, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
+; GFX1250-NEXT: v_wmma_scale_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41], v43, v42 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
; GFX1250-NEXT: s_clause 0x3
; GFX1250-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48
; GFX1250-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32
@@ -2774,9 +2772,9 @@ define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_non_inlineable(<16 x i
; GFX1250-LABEL: test_wmma_scale16_f32_32x16x128_f4_non_inlineable:
; GFX1250: ; %bb.0: ; %bb
; GFX1250-NEXT: v_mov_b32_e32 v26, 0x40400000
-; GFX1250-NEXT: s_mov_b64 s[0:1], 0x65
-; GFX1250-NEXT: s_mov_b64 s[2:3], 0x64
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_mov_b64_e32 v[42:43], 0x65
+; GFX1250-NEXT: v_mov_b64_e32 v[44:45], 0x64
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1250-NEXT: v_dual_mov_b32 v27, v26 :: v_dual_mov_b32 v28, v26
; GFX1250-NEXT: v_dual_mov_b32 v29, v26 :: v_dual_mov_b32 v30, v26
; GFX1250-NEXT: v_dual_mov_b32 v31, v26 :: v_dual_mov_b32 v32, v26
@@ -2786,7 +2784,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_non_inlineable(<16 x i
; GFX1250-NEXT: v_dual_mov_b32 v39, v26 :: v_dual_mov_b32 v40, v26
; GFX1250-NEXT: v_mov_b32_e32 v41, v26
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41], s[2:3], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
+; GFX1250-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41], v[44:45], v[42:43] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
; GFX1250-NEXT: s_clause 0x3
; GFX1250-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48
; GFX1250-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32
|
These cannot accept high registers.
beffcb5
to
ca0f7da
Compare
…operands_of_wmma_to_low_256_vgprs
…operands_of_wmma_to_low_256_vgprs
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/88/builds/15864 Here is the relevant piece of the build log for the reference
|
These cannot accept high registers.