Skip to content

Conversation

rampitec
Copy link
Collaborator

Note that true16 version of it does not work failing to select
a mere i16 load.

Note that true16 version of it does not work failing to select
a mere i16 load.
Copy link
Collaborator Author

This stack of pull requests is managed by Graphite. Learn more about stacking.

@rampitec rampitec marked this pull request as ready for review September 23, 2025 06:37
@llvmbot
Copy link
Member

llvmbot commented Sep 23, 2025

@llvm/pr-subscribers-backend-amdgpu

Author: Stanislav Mekhanoshin (rampitec)

Changes

Note that true16 version of it does not work failing to select
a mere i16 load.


Patch is 217.08 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/160241.diff

1 Files Affected:

  • (modified) llvm/test/CodeGen/AMDGPU/bf16.ll (+3238)
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 44c719f3635c8..371e460d9638e 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -7,6 +7,10 @@
 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s -check-prefixes=GFX10
 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 | FileCheck %s -check-prefixes=GFX11,GFX11TRUE16
 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 | FileCheck %s -check-prefixes=GFX11,GFX11FAKE16
+; xUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 | FileCheck %s -check-prefixes=GFX1250,GFX1250TRUE16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 | FileCheck %s -check-prefixes=GFX1250,GFX1250FAKE16
+
+; FIXME: real-true16 version of gfx1250 test fails
 
 define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; GCN-LABEL: test_load_store:
@@ -76,6 +80,15 @@ define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11FAKE16-NEXT:    global_store_b16 v[2:3], v0, off
 ; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: test_load_store:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    global_load_u16 v0, v[0:1], off
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_store_b16 v[2:3], v0, off
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
   %val = load bfloat, ptr addrspace(1) %in
   store bfloat %val, ptr addrspace(1) %out
   ret void
@@ -135,6 +148,14 @@ define <2 x bfloat> @v_load_global_v2bf16(ptr addrspace(1) %ptr) {
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_load_global_v2bf16:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
   %load = load <2 x bfloat>, ptr addrspace(1) %ptr
   ret <2 x bfloat> %load
 }
@@ -195,6 +216,14 @@ define <3 x bfloat> @v_load_global_v3bf16(ptr addrspace(1) %ptr) {
 ; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_load_global_v3bf16:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    global_load_b64 v[0:1], v[0:1], off
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
   %load = load <3 x bfloat>, ptr addrspace(1) %ptr
   ret <3 x bfloat> %load
 }
@@ -257,6 +286,14 @@ define <4 x bfloat> @v_load_global_v4bf16(ptr addrspace(1) %ptr) {
 ; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_load_global_v4bf16:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    global_load_b64 v[0:1], v[0:1], off
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
   %load = load <4 x bfloat>, ptr addrspace(1) %ptr
   ret <4 x bfloat> %load
 }
@@ -323,6 +360,14 @@ define <6 x bfloat> @v_load_global_v6bf16(ptr addrspace(1) %ptr) {
 ; GFX11-NEXT:    global_load_b96 v[0:2], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_load_global_v6bf16:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    global_load_b96 v[0:2], v[0:1], off
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
   %load = load <6 x bfloat>, ptr addrspace(1) %ptr
   ret <6 x bfloat> %load
 }
@@ -393,6 +438,14 @@ define <8 x bfloat> @v_load_global_v8bf16(ptr addrspace(1) %ptr) {
 ; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_load_global_v8bf16:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    global_load_b128 v[0:3], v[0:1], off
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
   %load = load <8 x bfloat>, ptr addrspace(1) %ptr
   ret <8 x bfloat> %load
 }
@@ -511,6 +564,17 @@ define <16 x bfloat> @v_load_global_v16bf16(ptr addrspace(1) %ptr) {
 ; GFX11-NEXT:    global_load_b128 v[4:7], v[4:5], off offset:16
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_load_global_v16bf16:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX1250-NEXT:    s_clause 0x1
+; GFX1250-NEXT:    global_load_b128 v[0:3], v[4:5], off
+; GFX1250-NEXT:    global_load_b128 v[4:7], v[4:5], off offset:16
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
   %load = load <16 x bfloat>, ptr addrspace(1) %ptr
   ret <16 x bfloat> %load
 }
@@ -683,6 +747,19 @@ define <32 x bfloat> @v_load_global_v32bf16(ptr addrspace(1) %ptr) {
 ; GFX11-NEXT:    global_load_b128 v[12:15], v[12:13], off offset:48
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_load_global_v32bf16:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_dual_mov_b32 v13, v1 :: v_dual_mov_b32 v12, v0
+; GFX1250-NEXT:    s_clause 0x3
+; GFX1250-NEXT:    global_load_b128 v[0:3], v[12:13], off
+; GFX1250-NEXT:    global_load_b128 v[4:7], v[12:13], off offset:16
+; GFX1250-NEXT:    global_load_b128 v[8:11], v[12:13], off offset:32
+; GFX1250-NEXT:    global_load_b128 v[12:15], v[12:13], off offset:48
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
   %load = load <32 x bfloat>, ptr addrspace(1) %ptr
   ret <32 x bfloat> %load
 }
@@ -973,6 +1050,23 @@ define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) {
 ; GFX11-NEXT:    global_load_b128 v[28:31], v[28:29], off offset:112
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_load_global_v64bf16:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_dual_mov_b32 v29, v1 :: v_dual_mov_b32 v28, v0
+; GFX1250-NEXT:    s_clause 0x7
+; GFX1250-NEXT:    global_load_b128 v[0:3], v[28:29], off
+; GFX1250-NEXT:    global_load_b128 v[4:7], v[28:29], off offset:16
+; GFX1250-NEXT:    global_load_b128 v[8:11], v[28:29], off offset:32
+; GFX1250-NEXT:    global_load_b128 v[12:15], v[28:29], off offset:48
+; GFX1250-NEXT:    global_load_b128 v[16:19], v[28:29], off offset:64
+; GFX1250-NEXT:    global_load_b128 v[20:23], v[28:29], off offset:80
+; GFX1250-NEXT:    global_load_b128 v[24:27], v[28:29], off offset:96
+; GFX1250-NEXT:    global_load_b128 v[28:31], v[28:29], off offset:112
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
   %load = load <64 x bfloat>, ptr addrspace(1) %ptr
   ret <64 x bfloat> %load
 }
@@ -1042,6 +1136,14 @@ define void @v_store_global_v2bf16(<2 x bfloat> %val, ptr addrspace(1) %ptr) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v[1:2], v0, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_store_global_v2bf16:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-NEXT:    global_store_b32 v[2:3], v0, off
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
   store <2 x bfloat> %val, ptr addrspace(1) %ptr
   ret void
 }
@@ -1115,6 +1217,15 @@ define void @v_store_global_v3bf16(<3 x bfloat> %val, ptr addrspace(1) %ptr) {
 ; GFX11-NEXT:    global_store_b16 v[2:3], v1, off offset:4
 ; GFX11-NEXT:    global_store_b32 v[2:3], v0, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_store_global_v3bf16:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_clause 0x1
+; GFX1250-NEXT:    global_store_b16 v[2:3], v1, off offset:4
+; GFX1250-NEXT:    global_store_b32 v[2:3], v0, off
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
   store <3 x bfloat> %val, ptr addrspace(1) %ptr
   ret void
 }
@@ -1183,6 +1294,13 @@ define void @v_store_global_v4bf16(<4 x bfloat> %val, ptr addrspace(1) %ptr) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b64 v[2:3], v[0:1], off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_store_global_v4bf16:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    global_store_b64 v[2:3], v[0:1], off
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
   store <4 x bfloat> %val, ptr addrspace(1) %ptr
   ret void
 }
@@ -1267,6 +1385,13 @@ define void @v_store_global_v8bf16(<8 x bfloat> %val, ptr addrspace(1) %ptr) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b128 v[4:5], v[0:3], off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_store_global_v8bf16:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    global_store_b128 v[4:5], v[0:3], off
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
   store <8 x bfloat> %val, ptr addrspace(1) %ptr
   ret void
 }
@@ -1393,6 +1518,15 @@ define void @v_store_global_v16bf16(<16 x bfloat> %val, ptr addrspace(1) %ptr) {
 ; GFX11-NEXT:    global_store_b128 v[8:9], v[4:7], off offset:16
 ; GFX11-NEXT:    global_store_b128 v[8:9], v[0:3], off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_store_global_v16bf16:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_clause 0x1
+; GFX1250-NEXT:    global_store_b128 v[8:9], v[4:7], off offset:16
+; GFX1250-NEXT:    global_store_b128 v[8:9], v[0:3], off
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
   store <16 x bfloat> %val, ptr addrspace(1) %ptr
   ret void
 }
@@ -1610,6 +1744,17 @@ define void @v_store_global_v32bf16(<32 x bfloat> %val, ptr addrspace(1) %ptr) {
 ; GFX11-NEXT:    global_store_b128 v[16:17], v[4:7], off offset:16
 ; GFX11-NEXT:    global_store_b128 v[16:17], v[0:3], off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_store_global_v32bf16:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_clause 0x3
+; GFX1250-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:48
+; GFX1250-NEXT:    global_store_b128 v[16:17], v[8:11], off offset:32
+; GFX1250-NEXT:    global_store_b128 v[16:17], v[4:7], off offset:16
+; GFX1250-NEXT:    global_store_b128 v[16:17], v[0:3], off
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
   store <32 x bfloat> %val, ptr addrspace(1) %ptr
   ret void
 }
@@ -2148,6 +2293,26 @@ define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) {
 ; GFX11-NEXT:    global_store_b128 v[32:33], v[4:7], off offset:16
 ; GFX11-NEXT:    global_store_b128 v[32:33], v[0:3], off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_store_global_v64bf16:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_clause 0x2
+; GFX1250-NEXT:    scratch_load_b32 v33, off, s32 offset:8
+; GFX1250-NEXT:    scratch_load_b32 v32, off, s32 offset:4
+; GFX1250-NEXT:    scratch_load_b32 v31, off, s32
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_clause 0x7
+; GFX1250-NEXT:    global_store_b128 v[32:33], v[28:31], off offset:112
+; GFX1250-NEXT:    global_store_b128 v[32:33], v[24:27], off offset:96
+; GFX1250-NEXT:    global_store_b128 v[32:33], v[20:23], off offset:80
+; GFX1250-NEXT:    global_store_b128 v[32:33], v[16:19], off offset:64
+; GFX1250-NEXT:    global_store_b128 v[32:33], v[12:15], off offset:48
+; GFX1250-NEXT:    global_store_b128 v[32:33], v[8:11], off offset:32
+; GFX1250-NEXT:    global_store_b128 v[32:33], v[4:7], off offset:16
+; GFX1250-NEXT:    global_store_b128 v[32:33], v[0:3], off
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
   store <64 x bfloat> %val, ptr addrspace(1) %ptr
   ret void
 }
@@ -2227,6 +2392,16 @@ define void @test_store_fpimm(ptr addrspace(1) %ptr0, ptr addrspace(1) %ptr1) {
 ; GFX11FAKE16-NEXT:    global_store_b16 v[0:1], v4, off
 ; GFX11FAKE16-NEXT:    global_store_b16 v[2:3], v5, off
 ; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: test_store_fpimm:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v4, 0x3f80
+; GFX1250-NEXT:    v_mov_b32_e32 v5, 0x4228
+; GFX1250-NEXT:    global_store_b16 v[0:1], v4, off
+; GFX1250-NEXT:    global_store_b16 v[2:3], v5, off
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
   store bfloat 1.0, ptr addrspace(1) %ptr0
   store bfloat 42.0, ptr addrspace(1) %ptr1
   ret void
@@ -2330,6 +2505,16 @@ define void @test_load_store_f32_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v4, vcc_lo
 ; GFX11-NEXT:    global_store_d16_hi_b16 v[2:3], v0, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: test_load_store_f32_to_bf16:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX1250-NEXT:    global_store_b16 v[2:3], v0, off
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
   %val = load float, ptr addrspace(1) %in
   %val.bf16 = fptrunc float %val to bfloat
   store bfloat %val.bf16, ptr addrspace(1) %out
@@ -2488,6 +2673,29 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc_lo
 ; GFX11-NEXT:    global_store_d16_hi_b16 v[2:3], v0, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: test_load_store_f64_to_bf16:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    global_load_b64 v[0:1], v[0:1], off
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_cvt_f32_f64_e32 v6, v[0:1]
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_cvt_f64_f32_e32 v[4:5], v6
+; GFX1250-NEXT:    v_cmp_gt_f64_e64 s0, |v[0:1]|, |v[4:5]|
+; GFX1250-NEXT:    v_cmp_nlg_f64_e32 vcc_lo, v[0:1], v[4:5]
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, -1, 1, s0
+; GFX1250-NEXT:    v_dual_add_nc_u32 v0, v6, v0 :: v_dual_bitop2_b32 v7, 1, v6 bitop3:0x40
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_cmp_eq_u32_e64 s0, 1, v7
+; GFX1250-NEXT:    s_or_b32 vcc_lo, vcc_lo, s0
+; GFX1250-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc_lo
+; GFX1250-NEXT:    v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX1250-NEXT:    global_store_b16 v[2:3], v0, off
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
   %val = load double, ptr addrspace(1) %in
   %val.bf16 = fptrunc double %val to bfloat
   store bfloat %val.bf16, ptr addrspace(1) %out
@@ -2560,6 +2768,16 @@ define void @test_load_store_bf16_to_f32(ptr addrspace(1) %in, ptr addrspace(1)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    global_store_b32 v[2:3], v0, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: test_load_store_bf16_to_f32:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    global_load_u16 v0, v[0:1], off
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX1250-NEXT:    global_store_b32 v[2:3], v0, off
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
   %val = load bfloat, ptr addrspace(1) %in
   %val.f32 = fpext bfloat %val to float
   store float %val.f32, ptr addrspace(1) %out
@@ -2639,6 +2857,18 @@ define void @test_load_store_bf16_to_f64(ptr addrspace(1) %in, ptr addrspace(1)
 ; GFX11-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
 ; GFX11-NEXT:    global_store_b64 v[2:3], v[0:1], off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: test_load_store_bf16_to_f64:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    global_load_u16 v0, v[0:1], off
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; GFX1250-NEXT:    global_store_b64 v[2:3], v[0:1], off
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
   %val = load bfloat, ptr addrspace(1) %in
   %val.f64 = fpext bfloat %val to double
   store double %val.f64, ptr addrspace(1) %out
@@ -2705,6 +2935,15 @@ define void @test_load_store_v2bf16(ptr addrspace(1) %in, ptr addrspace(1) %out)
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v[2:3], v0, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: test_load_store_v2bf16:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_store_b32 v[2:3], v0, off
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
   %val = load <2 x bfloat>, ptr addrspace(1) %in
   store <2 x bfloat> %val, ptr addrspace(1) %out
   ret void
@@ -2770,6 +3009,15 @@ define void @test_load_store_v4bf16(ptr addrspace(1) %in, ptr addrspace(1) %out)
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b64 v[2:3], v[0:1], off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: test_load_store_v4bf16:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    global_load_b64 v[0:1], v[0:1], off
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_store_b64 v[2:3], v[0:1], off
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
   %val = load <4 x bfloat>, ptr addrspace(1) %in
   store <4 x bfloat> %val, ptr addrspace(1) %out
   ret void
@@ -2835,6 +3083,15 @@ define void @test_load_store_v8bf16(ptr addrspace(1) %in, ptr addrspace(1) %out)
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b128 v[2:3], v[4:7], off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: test_load_store_v8bf16:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    global_load_b128 v[4:7], v[0:1], off
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_store_b128 v[2:3], v[4:7], off
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
   %val = load <8 x bfloat>, ptr addrspace(1) %in
   store <8 x bfloat> %val, ptr addrspace(1) %out
   ret void
@@ -2924,6 +3181,19 @@ define void @test_load_store_v16bf16(ptr addrspace(1) %in, ptr addrspace(1) %out
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b128 v[2:3], v[8:11], off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: test_load_store_v16bf16:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_clause 0x1
+; GFX1250-NEXT:    global_load_b128 v[4:7], v[0:1], off offset:16
+; GFX1250-NEXT:    global_load_b128 v[8:11], v[0:1], off
+; GFX1250-NEXT:    s_wait_loadcnt 0x1
+; GFX1250-NEXT:    global_store_b128 v[2:3], v[4:7], off offset:16
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_store_b128 v[2:3], v[8:11], off
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
   %val = load <16 x bfloat>, ptr add...
[truncated]

@rampitec
Copy link
Collaborator Author

Note that true16 version of it does not work failing to select a mere i16 load.

It has forced SramECC, so does not preserve the other half of a register. So we trivially have no load patterns for t16.

@rampitec rampitec merged commit d5b97aa into main Sep 23, 2025
13 checks passed
@rampitec rampitec deleted the users/rampitec/09-22-_amdgpu_add_gfx1250_runline_to_bf16.ll._nfc branch September 23, 2025 07:37
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

3 participants