-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[AMDGPU] Add gfx1250 runline to bf16.ll. NFC #160241
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AMDGPU] Add gfx1250 runline to bf16.ll. NFC #160241
Conversation
Note that true16 version of it does not work failing to select a mere i16 load.
@llvm/pr-subscribers-backend-amdgpu Author: Stanislav Mekhanoshin (rampitec) ChangesNote that true16 version of it does not work failing to select Patch is 217.08 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/160241.diff 1 Files Affected:
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 44c719f3635c8..371e460d9638e 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -7,6 +7,10 @@
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s -check-prefixes=GFX10
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 | FileCheck %s -check-prefixes=GFX11,GFX11TRUE16
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 | FileCheck %s -check-prefixes=GFX11,GFX11FAKE16
+; xUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 | FileCheck %s -check-prefixes=GFX1250,GFX1250TRUE16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 | FileCheck %s -check-prefixes=GFX1250,GFX1250FAKE16
+
+; FIXME: real-true16 version of gfx1250 test fails
define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; GCN-LABEL: test_load_store:
@@ -76,6 +80,15 @@ define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11FAKE16-NEXT: global_store_b16 v[2:3], v0, off
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: test_load_store:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_u16 v0, v[0:1], off
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b16 v[2:3], v0, off
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%val = load bfloat, ptr addrspace(1) %in
store bfloat %val, ptr addrspace(1) %out
ret void
@@ -135,6 +148,14 @@ define <2 x bfloat> @v_load_global_v2bf16(ptr addrspace(1) %ptr) {
; GFX11-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_load_global_v2bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v0, v[0:1], off
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%load = load <2 x bfloat>, ptr addrspace(1) %ptr
ret <2 x bfloat> %load
}
@@ -195,6 +216,14 @@ define <3 x bfloat> @v_load_global_v3bf16(ptr addrspace(1) %ptr) {
; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_load_global_v3bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b64 v[0:1], v[0:1], off
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%load = load <3 x bfloat>, ptr addrspace(1) %ptr
ret <3 x bfloat> %load
}
@@ -257,6 +286,14 @@ define <4 x bfloat> @v_load_global_v4bf16(ptr addrspace(1) %ptr) {
; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_load_global_v4bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b64 v[0:1], v[0:1], off
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%load = load <4 x bfloat>, ptr addrspace(1) %ptr
ret <4 x bfloat> %load
}
@@ -323,6 +360,14 @@ define <6 x bfloat> @v_load_global_v6bf16(ptr addrspace(1) %ptr) {
; GFX11-NEXT: global_load_b96 v[0:2], v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_load_global_v6bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%load = load <6 x bfloat>, ptr addrspace(1) %ptr
ret <6 x bfloat> %load
}
@@ -393,6 +438,14 @@ define <8 x bfloat> @v_load_global_v8bf16(ptr addrspace(1) %ptr) {
; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_load_global_v8bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b128 v[0:3], v[0:1], off
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%load = load <8 x bfloat>, ptr addrspace(1) %ptr
ret <8 x bfloat> %load
}
@@ -511,6 +564,17 @@ define <16 x bfloat> @v_load_global_v16bf16(ptr addrspace(1) %ptr) {
; GFX11-NEXT: global_load_b128 v[4:7], v[4:5], off offset:16
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_load_global_v16bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_load_b128 v[0:3], v[4:5], off
+; GFX1250-NEXT: global_load_b128 v[4:7], v[4:5], off offset:16
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%load = load <16 x bfloat>, ptr addrspace(1) %ptr
ret <16 x bfloat> %load
}
@@ -683,6 +747,19 @@ define <32 x bfloat> @v_load_global_v32bf16(ptr addrspace(1) %ptr) {
; GFX11-NEXT: global_load_b128 v[12:15], v[12:13], off offset:48
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_load_global_v32bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v13, v1 :: v_dual_mov_b32 v12, v0
+; GFX1250-NEXT: s_clause 0x3
+; GFX1250-NEXT: global_load_b128 v[0:3], v[12:13], off
+; GFX1250-NEXT: global_load_b128 v[4:7], v[12:13], off offset:16
+; GFX1250-NEXT: global_load_b128 v[8:11], v[12:13], off offset:32
+; GFX1250-NEXT: global_load_b128 v[12:15], v[12:13], off offset:48
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%load = load <32 x bfloat>, ptr addrspace(1) %ptr
ret <32 x bfloat> %load
}
@@ -973,6 +1050,23 @@ define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) {
; GFX11-NEXT: global_load_b128 v[28:31], v[28:29], off offset:112
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_load_global_v64bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v29, v1 :: v_dual_mov_b32 v28, v0
+; GFX1250-NEXT: s_clause 0x7
+; GFX1250-NEXT: global_load_b128 v[0:3], v[28:29], off
+; GFX1250-NEXT: global_load_b128 v[4:7], v[28:29], off offset:16
+; GFX1250-NEXT: global_load_b128 v[8:11], v[28:29], off offset:32
+; GFX1250-NEXT: global_load_b128 v[12:15], v[28:29], off offset:48
+; GFX1250-NEXT: global_load_b128 v[16:19], v[28:29], off offset:64
+; GFX1250-NEXT: global_load_b128 v[20:23], v[28:29], off offset:80
+; GFX1250-NEXT: global_load_b128 v[24:27], v[28:29], off offset:96
+; GFX1250-NEXT: global_load_b128 v[28:31], v[28:29], off offset:112
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%load = load <64 x bfloat>, ptr addrspace(1) %ptr
ret <64 x bfloat> %load
}
@@ -1042,6 +1136,14 @@ define void @v_store_global_v2bf16(<2 x bfloat> %val, ptr addrspace(1) %ptr) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_store_b32 v[1:2], v0, off
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_store_global_v2bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-NEXT: global_store_b32 v[2:3], v0, off
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
store <2 x bfloat> %val, ptr addrspace(1) %ptr
ret void
}
@@ -1115,6 +1217,15 @@ define void @v_store_global_v3bf16(<3 x bfloat> %val, ptr addrspace(1) %ptr) {
; GFX11-NEXT: global_store_b16 v[2:3], v1, off offset:4
; GFX11-NEXT: global_store_b32 v[2:3], v0, off
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_store_global_v3bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b16 v[2:3], v1, off offset:4
+; GFX1250-NEXT: global_store_b32 v[2:3], v0, off
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
store <3 x bfloat> %val, ptr addrspace(1) %ptr
ret void
}
@@ -1183,6 +1294,13 @@ define void @v_store_global_v4bf16(<4 x bfloat> %val, ptr addrspace(1) %ptr) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_store_global_v4bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_store_b64 v[2:3], v[0:1], off
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
store <4 x bfloat> %val, ptr addrspace(1) %ptr
ret void
}
@@ -1267,6 +1385,13 @@ define void @v_store_global_v8bf16(<8 x bfloat> %val, ptr addrspace(1) %ptr) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_store_global_v8bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_store_b128 v[4:5], v[0:3], off
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
store <8 x bfloat> %val, ptr addrspace(1) %ptr
ret void
}
@@ -1393,6 +1518,15 @@ define void @v_store_global_v16bf16(<16 x bfloat> %val, ptr addrspace(1) %ptr) {
; GFX11-NEXT: global_store_b128 v[8:9], v[4:7], off offset:16
; GFX11-NEXT: global_store_b128 v[8:9], v[0:3], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_store_global_v16bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[8:9], v[4:7], off offset:16
+; GFX1250-NEXT: global_store_b128 v[8:9], v[0:3], off
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
store <16 x bfloat> %val, ptr addrspace(1) %ptr
ret void
}
@@ -1610,6 +1744,17 @@ define void @v_store_global_v32bf16(<32 x bfloat> %val, ptr addrspace(1) %ptr) {
; GFX11-NEXT: global_store_b128 v[16:17], v[4:7], off offset:16
; GFX11-NEXT: global_store_b128 v[16:17], v[0:3], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_store_global_v32bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_clause 0x3
+; GFX1250-NEXT: global_store_b128 v[16:17], v[12:15], off offset:48
+; GFX1250-NEXT: global_store_b128 v[16:17], v[8:11], off offset:32
+; GFX1250-NEXT: global_store_b128 v[16:17], v[4:7], off offset:16
+; GFX1250-NEXT: global_store_b128 v[16:17], v[0:3], off
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
store <32 x bfloat> %val, ptr addrspace(1) %ptr
ret void
}
@@ -2148,6 +2293,26 @@ define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) {
; GFX11-NEXT: global_store_b128 v[32:33], v[4:7], off offset:16
; GFX11-NEXT: global_store_b128 v[32:33], v[0:3], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_store_global_v64bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_clause 0x2
+; GFX1250-NEXT: scratch_load_b32 v33, off, s32 offset:8
+; GFX1250-NEXT: scratch_load_b32 v32, off, s32 offset:4
+; GFX1250-NEXT: scratch_load_b32 v31, off, s32
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_clause 0x7
+; GFX1250-NEXT: global_store_b128 v[32:33], v[28:31], off offset:112
+; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off offset:96
+; GFX1250-NEXT: global_store_b128 v[32:33], v[20:23], off offset:80
+; GFX1250-NEXT: global_store_b128 v[32:33], v[16:19], off offset:64
+; GFX1250-NEXT: global_store_b128 v[32:33], v[12:15], off offset:48
+; GFX1250-NEXT: global_store_b128 v[32:33], v[8:11], off offset:32
+; GFX1250-NEXT: global_store_b128 v[32:33], v[4:7], off offset:16
+; GFX1250-NEXT: global_store_b128 v[32:33], v[0:3], off
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
store <64 x bfloat> %val, ptr addrspace(1) %ptr
ret void
}
@@ -2227,6 +2392,16 @@ define void @test_store_fpimm(ptr addrspace(1) %ptr0, ptr addrspace(1) %ptr1) {
; GFX11FAKE16-NEXT: global_store_b16 v[0:1], v4, off
; GFX11FAKE16-NEXT: global_store_b16 v[2:3], v5, off
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: test_store_fpimm:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v4, 0x3f80
+; GFX1250-NEXT: v_mov_b32_e32 v5, 0x4228
+; GFX1250-NEXT: global_store_b16 v[0:1], v4, off
+; GFX1250-NEXT: global_store_b16 v[2:3], v5, off
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
store bfloat 1.0, ptr addrspace(1) %ptr0
store bfloat 42.0, ptr addrspace(1) %ptr1
ret void
@@ -2330,6 +2505,16 @@ define void @test_load_store_f32_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc_lo
; GFX11-NEXT: global_store_d16_hi_b16 v[2:3], v0, off
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: test_load_store_f32_to_bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v0, v[0:1], off
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX1250-NEXT: global_store_b16 v[2:3], v0, off
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%val = load float, ptr addrspace(1) %in
%val.bf16 = fptrunc float %val to bfloat
store bfloat %val.bf16, ptr addrspace(1) %out
@@ -2488,6 +2673,29 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc_lo
; GFX11-NEXT: global_store_d16_hi_b16 v[2:3], v0, off
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: test_load_store_f64_to_bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b64 v[0:1], v[0:1], off
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_cvt_f32_f64_e32 v6, v[0:1]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
+; GFX1250-NEXT: v_cmp_gt_f64_e64 s0, |v[0:1]|, |v[4:5]|
+; GFX1250-NEXT: v_cmp_nlg_f64_e32 vcc_lo, v[0:1], v[4:5]
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, -1, 1, s0
+; GFX1250-NEXT: v_dual_add_nc_u32 v0, v6, v0 :: v_dual_bitop2_b32 v7, 1, v6 bitop3:0x40
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_cmp_eq_u32_e64 s0, 1, v7
+; GFX1250-NEXT: s_or_b32 vcc_lo, vcc_lo, s0
+; GFX1250-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
+; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX1250-NEXT: global_store_b16 v[2:3], v0, off
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%val = load double, ptr addrspace(1) %in
%val.bf16 = fptrunc double %val to bfloat
store bfloat %val.bf16, ptr addrspace(1) %out
@@ -2560,6 +2768,16 @@ define void @test_load_store_bf16_to_f32(ptr addrspace(1) %in, ptr addrspace(1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: global_store_b32 v[2:3], v0, off
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: test_load_store_bf16_to_f32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_u16 v0, v[0:1], off
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX1250-NEXT: global_store_b32 v[2:3], v0, off
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%val = load bfloat, ptr addrspace(1) %in
%val.f32 = fpext bfloat %val to float
store float %val.f32, ptr addrspace(1) %out
@@ -2639,6 +2857,18 @@ define void @test_load_store_bf16_to_f64(ptr addrspace(1) %in, ptr addrspace(1)
; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: test_load_store_bf16_to_f64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_u16 v0, v[0:1], off
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX1250-NEXT: global_store_b64 v[2:3], v[0:1], off
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%val = load bfloat, ptr addrspace(1) %in
%val.f64 = fpext bfloat %val to double
store double %val.f64, ptr addrspace(1) %out
@@ -2705,6 +2935,15 @@ define void @test_load_store_v2bf16(ptr addrspace(1) %in, ptr addrspace(1) %out)
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b32 v[2:3], v0, off
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: test_load_store_v2bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v0, v[0:1], off
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v[2:3], v0, off
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%val = load <2 x bfloat>, ptr addrspace(1) %in
store <2 x bfloat> %val, ptr addrspace(1) %out
ret void
@@ -2770,6 +3009,15 @@ define void @test_load_store_v4bf16(ptr addrspace(1) %in, ptr addrspace(1) %out)
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: test_load_store_v4bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b64 v[0:1], v[0:1], off
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b64 v[2:3], v[0:1], off
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%val = load <4 x bfloat>, ptr addrspace(1) %in
store <4 x bfloat> %val, ptr addrspace(1) %out
ret void
@@ -2835,6 +3083,15 @@ define void @test_load_store_v8bf16(ptr addrspace(1) %in, ptr addrspace(1) %out)
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b128 v[2:3], v[4:7], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: test_load_store_v8bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b128 v[4:7], v[0:1], off
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%val = load <8 x bfloat>, ptr addrspace(1) %in
store <8 x bfloat> %val, ptr addrspace(1) %out
ret void
@@ -2924,6 +3181,19 @@ define void @test_load_store_v16bf16(ptr addrspace(1) %in, ptr addrspace(1) %out
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b128 v[2:3], v[8:11], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: test_load_store_v16bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_load_b128 v[4:7], v[0:1], off offset:16
+; GFX1250-NEXT: global_load_b128 v[8:11], v[0:1], off
+; GFX1250-NEXT: s_wait_loadcnt 0x1
+; GFX1250-NEXT: global_store_b128 v[2:3], v[4:7], off offset:16
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b128 v[2:3], v[8:11], off
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%val = load <16 x bfloat>, ptr add...
[truncated]
|
It has forced SramECC, so does not preserve the other half of a register. So we trivially have no load patterns for t16. |
Note that true16 version of it does not work failing to select
a mere i16 load.