-
Notifications
You must be signed in to change notification settings - Fork 15.2k
AMDGPU: Add tests with forced selection of mfmas to VGPR form #150626
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
AMDGPU: Add tests with forced selection of mfmas to VGPR form #150626
Conversation
This stack of pull requests is managed by Graphite. Learn more about stacking. |
@llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) ChangesAdd some run lines to existing tests with VGPR selection enabled. Patch is 105.62 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/150626.diff 2 Files Affected:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
index 780c7e93f80d0..adbc2df4b3474 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
@@ -1,6 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX90A %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX942 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-mfma-vgpr-form < %s | FileCheck -enable-var-scope --check-prefixes=VGPR,GFX90A-VGPR %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -amdgpu-mfma-vgpr-form < %s | FileCheck -enable-var-scope --check-prefixes=VGPR,GFX942-VGPR %s
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x i16>, <4 x i16>, <32 x float>, i32, i32, i32)
declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x i16>, <4 x i16>, <16 x float>, i32, i32, i32)
@@ -127,6 +129,122 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) #
; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[34:35]
; GFX942-NEXT: global_store_dwordx4 v1, a[4:7], s[34:35] offset:16
; GFX942-NEXT: s_endpgm
+;
+; GFX90A-VGPR-LABEL: test_mfma_f32_32x32x4bf16_1k:
+; GFX90A-VGPR: ; %bb.0: ; %bb
+; GFX90A-VGPR-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v33, 0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v34, 1
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v35, v33
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v32, 2
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0
+; GFX90A-VGPR-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v0, s16
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v1, s17
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, s18
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v3, s19
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, s20
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, s21
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, s22
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, s23
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, s24
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v9, s25
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v10, s26
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v11, s27
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v12, s28
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v13, s29
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v14, s30
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v15, s31
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v16, s0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v17, s1
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v18, s2
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v19, s3
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v20, s4
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v21, s5
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v22, s6
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v23, s7
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v24, s8
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v25, s9
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v26, s10
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v27, s11
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v28, s12
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v29, s13
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v30, s14
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v31, s15
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: v_mfma_f32_32x32x4bf16_1k v[0:31], v[34:35], v[32:33], v[0:31] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 2
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v33, v[24:27], s[34:35] offset:96
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v33, v[28:31], s[34:35] offset:112
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v33, v[16:19], s[34:35] offset:64
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v33, v[20:23], s[34:35] offset:80
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v33, v[8:11], s[34:35] offset:32
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v33, v[12:15], s[34:35] offset:48
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v33, v[0:3], s[34:35]
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v33, v[4:7], s[34:35] offset:16
+; GFX90A-VGPR-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_32x32x4bf16_1k:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v33, 0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v34, 1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v35, v33
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v32, 2
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0
+; GFX942-VGPR-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, s16
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, s17
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, s18
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, s19
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, s20
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, s21
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, s22
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, s23
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, s24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, s25
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v10, s26
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v11, s27
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v12, s28
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v13, s29
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v14, s30
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v15, s31
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, s0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, s1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v18, s2
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v19, s3
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v20, s4
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v21, s5
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v22, s6
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v23, s7
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v24, s8
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v25, s9
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v26, s10
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v27, s11
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v28, s12
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v29, s13
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v30, s14
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v31, s15
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f32_32x32x4_2b_bf16 v[0:31], v[34:35], v[32:33], v[0:31] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 2
+; GFX942-VGPR-NEXT: global_store_dwordx4 v33, v[24:27], s[34:35] offset:96
+; GFX942-VGPR-NEXT: global_store_dwordx4 v33, v[28:31], s[34:35] offset:112
+; GFX942-VGPR-NEXT: global_store_dwordx4 v33, v[16:19], s[34:35] offset:64
+; GFX942-VGPR-NEXT: global_store_dwordx4 v33, v[20:23], s[34:35] offset:80
+; GFX942-VGPR-NEXT: global_store_dwordx4 v33, v[8:11], s[34:35] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v33, v[12:15], s[34:35] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v33, v[0:3], s[34:35]
+; GFX942-VGPR-NEXT: global_store_dwordx4 v33, v[4:7], s[34:35] offset:16
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <32 x float>, ptr addrspace(1) %arg
%a = bitcast i64 1 to <4 x i16>
@@ -208,6 +326,62 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(ptr addrspace(1) %arg) #
; GFX942-NEXT: global_store_dwordx4 v1, a[4:7], s[16:17] offset:16
; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[16:17]
; GFX942-NEXT: s_endpgm
+;
+; GFX90A-VGPR-LABEL: test_mfma_f32_16x16x4bf16_1k:
+; GFX90A-VGPR: ; %bb.0: ; %bb
+; GFX90A-VGPR-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v17, 0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v18, 1
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v19, v17
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v16, 2
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[10:11], s[10:11] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[12:13], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[14:15], s[14:15], s[14:15] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: v_mfma_f32_16x16x4bf16_1k v[0:15], v[18:19], v[16:17], v[0:15] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 2
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v17, v[12:15], s[16:17] offset:48
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v17, v[8:11], s[16:17] offset:32
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v17, v[4:7], s[16:17] offset:16
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v17, v[0:3], s[16:17]
+; GFX90A-VGPR-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_16x16x4bf16_1k:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, 0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v18, 1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v19, v17
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 2
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f32_16x16x4_4b_bf16 v[0:15], v[18:19], v[16:17], v[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 2
+; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[12:15], s[16:17] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[8:11], s[16:17] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[4:7], s[16:17] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[0:3], s[16:17]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <16 x float>, ptr addrspace(1) %arg
%a = bitcast i64 1 to <4 x i16>
@@ -257,6 +431,42 @@ define amdgpu_kernel void @test_mfma_f32_4x4x4bf16_1k(ptr addrspace(1) %arg) #0
; GFX942-NEXT: s_nop 4
; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7]
; GFX942-NEXT: s_endpgm
+;
+; GFX90A-VGPR-LABEL: test_mfma_f32_4x4x4bf16_1k:
+; GFX90A-VGPR: ; %bb.0: ; %bb
+; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, 1
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, v5
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, 2
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: v_mfma_f32_4x4x4bf16_1k v[0:3], v[6:7], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: s_nop 4
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7]
+; GFX90A-VGPR-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_4x4x4bf16_1k:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v5
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 2
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f32_4x4x4_16b_bf16 v[0:3], v[6:7], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: s_nop 4
+; GFX942-VGPR-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%a = bitcast i64 1 to <4 x i16>
@@ -339,6 +549,63 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(ptr addrspace(1) %arg) #
; GFX942-NEXT: global_store_dwordx4 v1, a[4:7], s[16:17] offset:16
; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[16:17]
; GFX942-NEXT: s_endpgm
+;
+; GFX90A-VGPR-LABEL: test_mfma_f32_32x32x8bf16_1k:
+; GFX90A-VGPR: ; %bb.0: ; %bb
+; GFX90A-VGPR-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v17, 0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v18, 1
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v19, v17
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v16, 2
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[10:11], s[10:11] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[12:13], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[14:15], s[14:15], s[14:15] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: v_mfma_f32_32x32x8bf16_1k v[0:15], v[18:19], v[16:17], v[0:15] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 2
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v17, v[12:15], s[16:17] offset:48
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v17, v[8:11], s[16:17] offset:32
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v17, v[4:7], s[16:17] offset:16
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v17, v[0:3], s[16:17]
+; GFX90A-VGPR-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_32x32x8bf16_1k:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, 0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v18, 1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v19, v17
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 2
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f32_32x32x8_bf16 v[0:15], v[18:19], v[16:17], v[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 2
+; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[12:15], s[16:17] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[8:11], s[16:17] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[4:7], s[16:17] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[0:3], s[16:17]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <16 x float>, ptr addrspace(1) %arg
%a = bitcast i64 1 to <4 x i16>
@@ -389,6 +656,43 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg)
; GFX942-NEXT: s_nop 6
; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7]
; GFX942-NEXT: s_endpgm
+;
+; GFX90A-VGPR-LABEL: test_mfma_f32_16x16x16bf16_1k:
+; GFX90A-VGPR: ; %bb.0: ; %bb
+; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, 1
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, v5
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, 2
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: v_mfma_f32_16x16x16bf16_1k v[0:3], v[6:7], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 2
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7]
+; GFX90A-VGPR-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_16x16x16bf16_1k:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v5
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 2
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f32_16x16x16_bf16 v[0:3], v[6:7], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: s_nop 6
+; GFX942-VGPR-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%a = bitcast i64 1 to <4 x i16>
@@ -430,6 +734,38 @@ define amdgpu_kernel void @test_mfma_f64_4x4x4f64(ptr addrspace(1) %arg, double
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: global_store_dwordx2 v0, a[0:1], s[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX90A-VGPR-LABEL: test_mfma_f64_4x4x4f64:
+; GFX90A-VGPR: ; %bb.0: ; %bb
+; GFX90A-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: v_mfma_f64_4x4x4f64 v[4:5], v[0:1], v[2:3], 0
+; GFX90A-VGPR-NEXT: s_nop 3
+; GFX90A-VGPR-NEXT: v_mfma_f64_4x4x4f64 v[0:1], v[0:1], v[2:3], v[4:5] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX90A-VGPR-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f64_4x4x4f64:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f64_4x4x4_4b_f64 v[4:5], v[0:1], v[2:3], 0
+; GFX942-VGPR-NEXT: s_nop 3
+; GFX942-VGPR-NEXT: v_mfma_f64_4x4x4_4b_f64 v[0:1], v[0:1], v[2:3], v[4:5] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%mai.1 = tail call double @llvm.amdgcn.mfma.f64.4x4x4f64(double %a, double %b, double 0.0, i32 0, i32 0, i32 0)
%mai.2 = tail call double @llvm.amdgcn.mfma.f64.4x4x4f64(double %a, double %b, double %mai.1, i32 1, i32 2, i32 3)
@@ -493,6 +829,54 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl
; GFX942-NEX...
[truncated]
|
Merge activity
|
41c6bc3
to
4328b06
Compare
29195b5
to
dd31101
Compare
It seems like graphite can no longer directly bypass the checks after the confirmation checkbox is added. It keeps rebasing this branch. |
b81d061
to
86c6961
Compare
Add some run lines to existing tests with VGPR selection enabled.
86c6961
to
5810f83
Compare
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/162/builds/27616 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/65/builds/20214 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/16/builds/23303 Here is the relevant piece of the build log for the reference
|
…50626) Add some run lines to existing tests with VGPR selection enabled.
Add some run lines to existing tests with VGPR selection enabled.