diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index 44c719f3635c8..371e460d9638e 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -7,6 +7,10 @@ ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s -check-prefixes=GFX10 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 | FileCheck %s -check-prefixes=GFX11,GFX11TRUE16 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 | FileCheck %s -check-prefixes=GFX11,GFX11FAKE16 +; xUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 | FileCheck %s -check-prefixes=GFX1250,GFX1250TRUE16 +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 | FileCheck %s -check-prefixes=GFX1250,GFX1250FAKE16 + +; FIXME: real-true16 version of gfx1250 test fails define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; GCN-LABEL: test_load_store: @@ -76,6 +80,15 @@ define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11FAKE16-NEXT: global_store_b16 v[2:3], v0, off ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_load_store: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_u16 v0, v[0:1], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b16 v[2:3], v0, off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %val = load bfloat, ptr addrspace(1) %in store bfloat %val, ptr addrspace(1) %out ret void @@ -135,6 +148,14 @@ define <2 x bfloat> @v_load_global_v2bf16(ptr addrspace(1) %ptr) { ; GFX11-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_load_global_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v0, v[0:1], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %load = load <2 x bfloat>, ptr addrspace(1) %ptr ret <2 x bfloat> %load } @@ -195,6 +216,14 @@ define <3 x bfloat> @v_load_global_v3bf16(ptr addrspace(1) %ptr) { ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_load_global_v3bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %load = load <3 x bfloat>, ptr addrspace(1) %ptr ret <3 x bfloat> %load } @@ -257,6 +286,14 @@ define <4 x bfloat> @v_load_global_v4bf16(ptr addrspace(1) %ptr) { ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_load_global_v4bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %load = load <4 x bfloat>, ptr addrspace(1) %ptr ret <4 x bfloat> %load } @@ -323,6 +360,14 @@ define <6 x bfloat> @v_load_global_v6bf16(ptr addrspace(1) %ptr) { ; GFX11-NEXT: global_load_b96 v[0:2], v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_load_global_v6bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b96 v[0:2], v[0:1], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %load = load <6 x bfloat>, ptr addrspace(1) %ptr ret <6 x bfloat> %load } @@ -393,6 +438,14 @@ define <8 x bfloat> @v_load_global_v8bf16(ptr addrspace(1) %ptr) { ; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_load_global_v8bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b128 v[0:3], v[0:1], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %load = load <8 x bfloat>, ptr addrspace(1) %ptr ret <8 x bfloat> %load } @@ -511,6 +564,17 @@ define <16 x bfloat> @v_load_global_v16bf16(ptr addrspace(1) %ptr) { ; GFX11-NEXT: global_load_b128 v[4:7], v[4:5], off offset:16 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_load_global_v16bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_load_b128 v[0:3], v[4:5], off +; GFX1250-NEXT: global_load_b128 v[4:7], v[4:5], off offset:16 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %load = load <16 x bfloat>, ptr addrspace(1) %ptr ret <16 x bfloat> %load } @@ -683,6 +747,19 @@ define <32 x bfloat> @v_load_global_v32bf16(ptr addrspace(1) %ptr) { ; GFX11-NEXT: global_load_b128 v[12:15], v[12:13], off offset:48 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_load_global_v32bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_mov_b32 v13, v1 :: v_dual_mov_b32 v12, v0 +; GFX1250-NEXT: s_clause 0x3 +; GFX1250-NEXT: global_load_b128 v[0:3], v[12:13], off +; GFX1250-NEXT: global_load_b128 v[4:7], v[12:13], off offset:16 +; GFX1250-NEXT: global_load_b128 v[8:11], v[12:13], off offset:32 +; GFX1250-NEXT: global_load_b128 v[12:15], v[12:13], off offset:48 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %load = load <32 x bfloat>, ptr addrspace(1) %ptr ret <32 x bfloat> %load } @@ -973,6 +1050,23 @@ define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) { ; GFX11-NEXT: global_load_b128 v[28:31], v[28:29], off offset:112 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_load_global_v64bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_mov_b32 v29, v1 :: v_dual_mov_b32 v28, v0 +; GFX1250-NEXT: s_clause 0x7 +; GFX1250-NEXT: global_load_b128 v[0:3], v[28:29], off +; GFX1250-NEXT: global_load_b128 v[4:7], v[28:29], off offset:16 +; GFX1250-NEXT: global_load_b128 v[8:11], v[28:29], off offset:32 +; GFX1250-NEXT: global_load_b128 v[12:15], v[28:29], off offset:48 +; GFX1250-NEXT: global_load_b128 v[16:19], v[28:29], off offset:64 +; GFX1250-NEXT: global_load_b128 v[20:23], v[28:29], off offset:80 +; GFX1250-NEXT: global_load_b128 v[24:27], v[28:29], off offset:96 +; GFX1250-NEXT: global_load_b128 v[28:31], v[28:29], off offset:112 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %load = load <64 x bfloat>, ptr addrspace(1) %ptr ret <64 x bfloat> %load } @@ -1042,6 +1136,14 @@ define void @v_store_global_v2bf16(<2 x bfloat> %val, ptr addrspace(1) %ptr) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v[1:2], v0, off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_store_global_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-NEXT: global_store_b32 v[2:3], v0, off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] store <2 x bfloat> %val, ptr addrspace(1) %ptr ret void } @@ -1115,6 +1217,15 @@ define void @v_store_global_v3bf16(<3 x bfloat> %val, ptr addrspace(1) %ptr) { ; GFX11-NEXT: global_store_b16 v[2:3], v1, off offset:4 ; GFX11-NEXT: global_store_b32 v[2:3], v0, off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_store_global_v3bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b16 v[2:3], v1, off offset:4 +; GFX1250-NEXT: global_store_b32 v[2:3], v0, off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] store <3 x bfloat> %val, ptr addrspace(1) %ptr ret void } @@ -1183,6 +1294,13 @@ define void @v_store_global_v4bf16(<4 x bfloat> %val, ptr addrspace(1) %ptr) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_store_global_v4bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] store <4 x bfloat> %val, ptr addrspace(1) %ptr ret void } @@ -1267,6 +1385,13 @@ define void @v_store_global_v8bf16(<8 x bfloat> %val, ptr addrspace(1) %ptr) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_store_global_v8bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b128 v[4:5], v[0:3], off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] store <8 x bfloat> %val, ptr addrspace(1) %ptr ret void } @@ -1393,6 +1518,15 @@ define void @v_store_global_v16bf16(<16 x bfloat> %val, ptr addrspace(1) %ptr) { ; GFX11-NEXT: global_store_b128 v[8:9], v[4:7], off offset:16 ; GFX11-NEXT: global_store_b128 v[8:9], v[0:3], off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_store_global_v16bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[8:9], v[4:7], off offset:16 +; GFX1250-NEXT: global_store_b128 v[8:9], v[0:3], off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] store <16 x bfloat> %val, ptr addrspace(1) %ptr ret void } @@ -1610,6 +1744,17 @@ define void @v_store_global_v32bf16(<32 x bfloat> %val, ptr addrspace(1) %ptr) { ; GFX11-NEXT: global_store_b128 v[16:17], v[4:7], off offset:16 ; GFX11-NEXT: global_store_b128 v[16:17], v[0:3], off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_store_global_v32bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_clause 0x3 +; GFX1250-NEXT: global_store_b128 v[16:17], v[12:15], off offset:48 +; GFX1250-NEXT: global_store_b128 v[16:17], v[8:11], off offset:32 +; GFX1250-NEXT: global_store_b128 v[16:17], v[4:7], off offset:16 +; GFX1250-NEXT: global_store_b128 v[16:17], v[0:3], off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] store <32 x bfloat> %val, ptr addrspace(1) %ptr ret void } @@ -2148,6 +2293,26 @@ define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) { ; GFX11-NEXT: global_store_b128 v[32:33], v[4:7], off offset:16 ; GFX11-NEXT: global_store_b128 v[32:33], v[0:3], off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_store_global_v64bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_clause 0x2 +; GFX1250-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX1250-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX1250-NEXT: scratch_load_b32 v31, off, s32 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_clause 0x7 +; GFX1250-NEXT: global_store_b128 v[32:33], v[28:31], off offset:112 +; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off offset:96 +; GFX1250-NEXT: global_store_b128 v[32:33], v[20:23], off offset:80 +; GFX1250-NEXT: global_store_b128 v[32:33], v[16:19], off offset:64 +; GFX1250-NEXT: global_store_b128 v[32:33], v[12:15], off offset:48 +; GFX1250-NEXT: global_store_b128 v[32:33], v[8:11], off offset:32 +; GFX1250-NEXT: global_store_b128 v[32:33], v[4:7], off offset:16 +; GFX1250-NEXT: global_store_b128 v[32:33], v[0:3], off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] store <64 x bfloat> %val, ptr addrspace(1) %ptr ret void } @@ -2227,6 +2392,16 @@ define void @test_store_fpimm(ptr addrspace(1) %ptr0, ptr addrspace(1) %ptr1) { ; GFX11FAKE16-NEXT: global_store_b16 v[0:1], v4, off ; GFX11FAKE16-NEXT: global_store_b16 v[2:3], v5, off ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_store_fpimm: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v4, 0x3f80 +; GFX1250-NEXT: v_mov_b32_e32 v5, 0x4228 +; GFX1250-NEXT: global_store_b16 v[0:1], v4, off +; GFX1250-NEXT: global_store_b16 v[2:3], v5, off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] store bfloat 1.0, ptr addrspace(1) %ptr0 store bfloat 42.0, ptr addrspace(1) %ptr1 ret void @@ -2330,6 +2505,16 @@ define void @test_load_store_f32_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc_lo ; GFX11-NEXT: global_store_d16_hi_b16 v[2:3], v0, off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_load_store_f32_to_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v0, v[0:1], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: global_store_b16 v[2:3], v0, off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %val = load float, ptr addrspace(1) %in %val.bf16 = fptrunc float %val to bfloat store bfloat %val.bf16, ptr addrspace(1) %out @@ -2488,6 +2673,29 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc_lo ; GFX11-NEXT: global_store_d16_hi_b16 v[2:3], v0, off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_load_store_f64_to_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_cvt_f32_f64_e32 v6, v[0:1] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 +; GFX1250-NEXT: v_cmp_gt_f64_e64 s0, |v[0:1]|, |v[4:5]| +; GFX1250-NEXT: v_cmp_nlg_f64_e32 vcc_lo, v[0:1], v[4:5] +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_cndmask_b32_e64 v0, -1, 1, s0 +; GFX1250-NEXT: v_dual_add_nc_u32 v0, v6, v0 :: v_dual_bitop2_b32 v7, 1, v6 bitop3:0x40 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_eq_u32_e64 s0, 1, v7 +; GFX1250-NEXT: s_or_b32 vcc_lo, vcc_lo, s0 +; GFX1250-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: global_store_b16 v[2:3], v0, off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %val = load double, ptr addrspace(1) %in %val.bf16 = fptrunc double %val to bfloat store bfloat %val.bf16, ptr addrspace(1) %out @@ -2560,6 +2768,16 @@ define void @test_load_store_bf16_to_f32(ptr addrspace(1) %in, ptr addrspace(1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: global_store_b32 v[2:3], v0, off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_load_store_bf16_to_f32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_u16 v0, v[0:1], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: global_store_b32 v[2:3], v0, off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %val = load bfloat, ptr addrspace(1) %in %val.f32 = fpext bfloat %val to float store float %val.f32, ptr addrspace(1) %out @@ -2639,6 +2857,18 @@ define void @test_load_store_bf16_to_f64(ptr addrspace(1) %in, ptr addrspace(1) ; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_load_store_bf16_to_f64: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_u16 v0, v[0:1], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX1250-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %val = load bfloat, ptr addrspace(1) %in %val.f64 = fpext bfloat %val to double store double %val.f64, ptr addrspace(1) %out @@ -2705,6 +2935,15 @@ define void @test_load_store_v2bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v[2:3], v0, off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_load_store_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v0, v[0:1], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v[2:3], v0, off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %val = load <2 x bfloat>, ptr addrspace(1) %in store <2 x bfloat> %val, ptr addrspace(1) %out ret void @@ -2770,6 +3009,15 @@ define void @test_load_store_v4bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_load_store_v4bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %val = load <4 x bfloat>, ptr addrspace(1) %in store <4 x bfloat> %val, ptr addrspace(1) %out ret void @@ -2835,6 +3083,15 @@ define void @test_load_store_v8bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b128 v[2:3], v[4:7], off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_load_store_v8bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b128 v[4:7], v[0:1], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %val = load <8 x bfloat>, ptr addrspace(1) %in store <8 x bfloat> %val, ptr addrspace(1) %out ret void @@ -2924,6 +3181,19 @@ define void @test_load_store_v16bf16(ptr addrspace(1) %in, ptr addrspace(1) %out ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b128 v[2:3], v[8:11], off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_load_store_v16bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_load_b128 v[4:7], v[0:1], off offset:16 +; GFX1250-NEXT: global_load_b128 v[8:11], v[0:1], off +; GFX1250-NEXT: s_wait_loadcnt 0x1 +; GFX1250-NEXT: global_store_b128 v[2:3], v[4:7], off offset:16 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b128 v[2:3], v[8:11], off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %val = load <16 x bfloat>, ptr addrspace(1) %in store <16 x bfloat> %val, ptr addrspace(1) %out ret void @@ -2990,6 +3260,14 @@ define void @test_arg_store(bfloat %in, ptr addrspace(1) %out) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v[1:2], v0, off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_arg_store: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-NEXT: global_store_b16 v[2:3], v0, off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] store bfloat %in, ptr addrspace(1) %out ret void } @@ -3059,6 +3337,14 @@ define void @test_arg_store_v2bf16(<2 x bfloat> %in, ptr addrspace(1) %out) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v[1:2], v0, off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_arg_store_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-NEXT: global_store_b32 v[2:3], v0, off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] store <2 x bfloat> %in, ptr addrspace(1) %out ret void } @@ -3132,6 +3418,15 @@ define void @test_arg_store_v3bf16(<3 x bfloat> %in, ptr addrspace(1) %out) { ; GFX11-NEXT: global_store_b16 v[2:3], v1, off offset:4 ; GFX11-NEXT: global_store_b32 v[2:3], v0, off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_arg_store_v3bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b16 v[2:3], v1, off offset:4 +; GFX1250-NEXT: global_store_b32 v[2:3], v0, off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] store <3 x bfloat> %in, ptr addrspace(1) %out ret void } @@ -3200,6 +3495,13 @@ define void @test_arg_store_v4bf16(<4 x bfloat> %in, ptr addrspace(1) %out) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_arg_store_v4bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] store <4 x bfloat> %in, ptr addrspace(1) %out ret void } @@ -3284,6 +3586,13 @@ define void @test_arg_store_v8bf16(<8 x bfloat> %in, ptr addrspace(1) %out) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_arg_store_v8bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b128 v[4:5], v[0:3], off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] store <8 x bfloat> %in, ptr addrspace(1) %out ret void } @@ -3410,6 +3719,15 @@ define void @test_arg_store_v16bf16(<16 x bfloat> %in, ptr addrspace(1) %out) { ; GFX11-NEXT: global_store_b128 v[8:9], v[4:7], off offset:16 ; GFX11-NEXT: global_store_b128 v[8:9], v[0:3], off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_arg_store_v16bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[8:9], v[4:7], off offset:16 +; GFX1250-NEXT: global_store_b128 v[8:9], v[0:3], off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] store <16 x bfloat> %in, ptr addrspace(1) %out ret void } @@ -3477,6 +3795,14 @@ define amdgpu_gfx void @test_inreg_arg_store(bfloat inreg %in, ptr addrspace(1) ; GFX11FAKE16-NEXT: v_mov_b32_e32 v2, s4 ; GFX11FAKE16-NEXT: global_store_b16 v[0:1], v2, off ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_inreg_arg_store: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s4 +; GFX1250-NEXT: global_store_b16 v[0:1], v2, off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] store bfloat %in, ptr addrspace(1) %out ret void } @@ -3539,6 +3865,13 @@ define bfloat @test_byval(ptr addrspace(5) byval(bfloat) %bv, bfloat %val) { ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11FAKE16-NEXT: scratch_store_b16 off, v0, s32 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_byval: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: scratch_store_b16 off, v0, s32 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] store bfloat %val, ptr addrspace(5) %bv %retval = load bfloat, ptr addrspace(5) %bv ret bfloat %retval @@ -3595,6 +3928,13 @@ define void @test_sret(ptr addrspace(5) sret(bfloat) %sret, bfloat %val) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: scratch_store_b16 v0, v1, off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_sret: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: scratch_store_b16 v0, v1, off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] store bfloat %val, ptr addrspace(5) %sret ret void } @@ -3667,6 +4007,15 @@ define void @test_bitcast_from_bfloat(ptr addrspace(1) %in, ptr addrspace(1) %ou ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11FAKE16-NEXT: global_store_b16 v[2:3], v0, off ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_bitcast_from_bfloat: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_u16 v0, v[0:1], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b16 v[2:3], v0, off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %val = load bfloat, ptr addrspace(1) %in %val_int = bitcast bfloat %val to i16 store i16 %val_int, ptr addrspace(1) %out @@ -3741,6 +4090,15 @@ define void @test_bitcast_to_bfloat(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11FAKE16-NEXT: global_store_b16 v[0:1], v2, off ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_bitcast_to_bfloat: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_u16 v2, v[2:3], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b16 v[0:1], v2, off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %val = load i16, ptr addrspace(1) %in %val_fp = bitcast i16 %val to bfloat store bfloat %val_fp, ptr addrspace(1) %out @@ -3777,6 +4135,12 @@ define bfloat @test_ret(bfloat %in) { ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_ret: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] entry: ret bfloat %in } @@ -3811,6 +4175,12 @@ define <2 x bfloat> @test_ret_v2bf16(<2 x bfloat> %in) { ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_ret_v2bf16: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] entry: ret <2 x bfloat> %in } @@ -3845,6 +4215,12 @@ define <3 x bfloat> @test_ret_v3bf16(<3 x bfloat> %in) { ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_ret_v3bf16: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] entry: ret <3 x bfloat> %in } @@ -3879,6 +4255,12 @@ define <4 x bfloat> @test_ret_v4bf16(<4 x bfloat> %in) { ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_ret_v4bf16: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] entry: ret <4 x bfloat> %in } @@ -3913,6 +4295,12 @@ define <8 x bfloat> @test_ret_v8bf16(<8 x bfloat> %in) { ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_ret_v8bf16: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] entry: ret <8 x bfloat> %in } @@ -3947,6 +4335,12 @@ define <16 x bfloat> @test_ret_v16bf16(<16 x bfloat> %in) { ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_ret_v16bf16: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] entry: ret <16 x bfloat> %in } @@ -4161,6 +4555,38 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { ; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_call: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s2, s33 +; GFX1250-NEXT: s_mov_b32 s33, s32 +; GFX1250-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX1250-NEXT: scratch_store_b32 off, v4, s33 ; 4-byte Folded Spill +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_mov_b32 exec_lo, s0 +; GFX1250-NEXT: s_get_pc_i64 s[0:1] +; GFX1250-NEXT: s_add_nc_u64 s[0:1], s[0:1], test_arg_store@gotpcrel+4 +; GFX1250-NEXT: v_writelane_b32 v4, s30, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1250-NEXT: s_add_co_i32 s32, s32, 16 +; GFX1250-NEXT: v_writelane_b32 v4, s31, 1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_swap_pc_i64 s[30:31], s[0:1] +; GFX1250-NEXT: scratch_store_b16 v1, v0, off scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: v_readlane_b32 s31, v4, 1 +; GFX1250-NEXT: v_readlane_b32 s30, v4, 0 +; GFX1250-NEXT: s_mov_b32 s32, s33 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX1250-NEXT: scratch_load_b32 v4, off, s33 ; 4-byte Folded Reload +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_mov_b32 exec_lo, s0 +; GFX1250-NEXT: s_mov_b32 s33, s2 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] entry: %result = call bfloat @test_arg_store(bfloat %in) store volatile bfloat %result, ptr addrspace(5) %out @@ -4387,6 +4813,38 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { ; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_call_v2bf16: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s2, s33 +; GFX1250-NEXT: s_mov_b32 s33, s32 +; GFX1250-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX1250-NEXT: scratch_store_b32 off, v4, s33 ; 4-byte Folded Spill +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_mov_b32 exec_lo, s0 +; GFX1250-NEXT: s_get_pc_i64 s[0:1] +; GFX1250-NEXT: s_add_nc_u64 s[0:1], s[0:1], test_arg_store_v2bf16@gotpcrel+4 +; GFX1250-NEXT: v_writelane_b32 v4, s30, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1250-NEXT: s_add_co_i32 s32, s32, 16 +; GFX1250-NEXT: v_writelane_b32 v4, s31, 1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_swap_pc_i64 s[30:31], s[0:1] +; GFX1250-NEXT: scratch_store_b32 v1, v0, off scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: v_readlane_b32 s31, v4, 1 +; GFX1250-NEXT: v_readlane_b32 s30, v4, 0 +; GFX1250-NEXT: s_mov_b32 s32, s33 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX1250-NEXT: scratch_load_b32 v4, off, s33 ; 4-byte Folded Reload +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_mov_b32 exec_lo, s0 +; GFX1250-NEXT: s_mov_b32 s33, s2 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] entry: %result = call <2 x bfloat> @test_arg_store_v2bf16(<2 x bfloat> %in) store volatile <2 x bfloat> %result, ptr addrspace(5) %out @@ -4629,6 +5087,41 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_call_v3bf16: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s2, s33 +; GFX1250-NEXT: s_mov_b32 s33, s32 +; GFX1250-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX1250-NEXT: scratch_store_b32 off, v5, s33 ; 4-byte Folded Spill +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_mov_b32 exec_lo, s0 +; GFX1250-NEXT: s_get_pc_i64 s[0:1] +; GFX1250-NEXT: s_add_nc_u64 s[0:1], s[0:1], test_arg_store_v2bf16@gotpcrel+4 +; GFX1250-NEXT: v_writelane_b32 v5, s30, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1250-NEXT: s_add_co_i32 s32, s32, 16 +; GFX1250-NEXT: v_mov_b32_e32 v4, v2 +; GFX1250-NEXT: v_writelane_b32 v5, s31, 1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_swap_pc_i64 s[30:31], s[0:1] +; GFX1250-NEXT: scratch_store_b16 v4, v1, off offset:4 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: scratch_store_b32 v4, v0, off scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: v_readlane_b32 s31, v5, 1 +; GFX1250-NEXT: v_readlane_b32 s30, v5, 0 +; GFX1250-NEXT: s_mov_b32 s32, s33 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX1250-NEXT: scratch_load_b32 v5, off, s33 ; 4-byte Folded Reload +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_mov_b32 exec_lo, s0 +; GFX1250-NEXT: s_mov_b32 s33, s2 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] entry: %result = call <3 x bfloat> @test_arg_store_v2bf16(<3 x bfloat> %in) store volatile <3 x bfloat> %result, ptr addrspace(5) %out @@ -4883,6 +5376,39 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_call_v4bf16: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s2, s33 +; GFX1250-NEXT: s_mov_b32 s33, s32 +; GFX1250-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX1250-NEXT: scratch_store_b32 off, v5, s33 ; 4-byte Folded Spill +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_mov_b32 exec_lo, s0 +; GFX1250-NEXT: s_get_pc_i64 s[0:1] +; GFX1250-NEXT: s_add_nc_u64 s[0:1], s[0:1], test_arg_store_v2bf16@gotpcrel+4 +; GFX1250-NEXT: v_writelane_b32 v5, s30, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1250-NEXT: s_add_co_i32 s32, s32, 16 +; GFX1250-NEXT: v_mov_b32_e32 v4, v2 +; GFX1250-NEXT: v_writelane_b32 v5, s31, 1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_swap_pc_i64 s[30:31], s[0:1] +; GFX1250-NEXT: scratch_store_b64 v4, v[0:1], off scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: v_readlane_b32 s31, v5, 1 +; GFX1250-NEXT: v_readlane_b32 s30, v5, 0 +; GFX1250-NEXT: s_mov_b32 s32, s33 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX1250-NEXT: scratch_load_b32 v5, off, s33 ; 4-byte Folded Reload +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_mov_b32 exec_lo, s0 +; GFX1250-NEXT: s_mov_b32 s33, s2 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] entry: %result = call <4 x bfloat> @test_arg_store_v2bf16(<4 x bfloat> %in) store volatile <4 x bfloat> %result, ptr addrspace(5) %out @@ -5190,6 +5716,38 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_call_v8bf16: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s2, s33 +; GFX1250-NEXT: s_mov_b32 s33, s32 +; GFX1250-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX1250-NEXT: scratch_store_b32 off, v5, s33 ; 4-byte Folded Spill +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_mov_b32 exec_lo, s0 +; GFX1250-NEXT: s_get_pc_i64 s[0:1] +; GFX1250-NEXT: s_add_nc_u64 s[0:1], s[0:1], test_arg_store_v2bf16@gotpcrel+4 +; GFX1250-NEXT: v_writelane_b32 v5, s30, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1250-NEXT: s_add_co_i32 s32, s32, 16 +; GFX1250-NEXT: v_writelane_b32 v5, s31, 1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_swap_pc_i64 s[30:31], s[0:1] +; GFX1250-NEXT: scratch_store_b128 v4, v[0:3], off scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: v_readlane_b32 s31, v5, 1 +; GFX1250-NEXT: v_readlane_b32 s30, v5, 0 +; GFX1250-NEXT: s_mov_b32 s32, s33 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX1250-NEXT: scratch_load_b32 v5, off, s33 ; 4-byte Folded Reload +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_mov_b32 exec_lo, s0 +; GFX1250-NEXT: s_mov_b32 s33, s2 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] entry: %result = call <8 x bfloat> @test_arg_store_v2bf16(<8 x bfloat> %in) store volatile <8 x bfloat> %result, ptr addrspace(5) %out @@ -5609,6 +6167,40 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_call_v16bf16: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s2, s33 +; GFX1250-NEXT: s_mov_b32 s33, s32 +; GFX1250-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX1250-NEXT: scratch_store_b32 off, v9, s33 ; 4-byte Folded Spill +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_mov_b32 exec_lo, s0 +; GFX1250-NEXT: s_get_pc_i64 s[0:1] +; GFX1250-NEXT: s_add_nc_u64 s[0:1], s[0:1], test_arg_store_v2bf16@gotpcrel+4 +; GFX1250-NEXT: v_writelane_b32 v9, s30, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1250-NEXT: s_add_co_i32 s32, s32, 16 +; GFX1250-NEXT: v_writelane_b32 v9, s31, 1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_swap_pc_i64 s[30:31], s[0:1] +; GFX1250-NEXT: scratch_store_b128 v8, v[4:7], off offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: scratch_store_b128 v8, v[0:3], off scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: v_readlane_b32 s31, v9, 1 +; GFX1250-NEXT: v_readlane_b32 s30, v9, 0 +; GFX1250-NEXT: s_mov_b32 s32, s33 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX1250-NEXT: scratch_load_b32 v9, off, s33 ; 4-byte Folded Reload +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_mov_b32 exec_lo, s0 +; GFX1250-NEXT: s_mov_b32 s33, s2 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] entry: %result = call <16 x bfloat> @test_arg_store_v2bf16(<16 x bfloat> %in) store volatile <16 x bfloat> %result, ptr addrspace(5) %out @@ -5693,6 +6285,16 @@ define bfloat @test_alloca_load_store_ret(bfloat %in) { ; GFX11FAKE16-NEXT: scratch_load_u16 v0, off, s32 glc dlc ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_alloca_load_store_ret: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: scratch_load_u16 v0, off, s32 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] entry: %in.addr = alloca bfloat, align 2, addrspace(5) store volatile bfloat %in, ptr addrspace(5) %in.addr, align 2 @@ -6105,6 +6707,28 @@ define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) { ; GFX11FAKE16-NEXT: scratch_store_b128 v0, v[26:29], off offset:96 ; GFX11FAKE16-NEXT: scratch_store_b16 v0, v1, off offset:128 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_overflow_stack: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_clause 0x2 +; GFX1250-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX1250-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX1250-NEXT: scratch_load_b32 v31, off, s32 +; GFX1250-NEXT: s_clause 0x5 +; GFX1250-NEXT: scratch_store_b128 v0, v[22:25], off offset:80 +; GFX1250-NEXT: scratch_store_b128 v0, v[18:21], off offset:64 +; GFX1250-NEXT: scratch_store_b128 v0, v[14:17], off offset:48 +; GFX1250-NEXT: scratch_store_b128 v0, v[10:13], off offset:32 +; GFX1250-NEXT: scratch_store_b128 v0, v[6:9], off offset:16 +; GFX1250-NEXT: scratch_store_b128 v0, v[2:5], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_clause 0x2 +; GFX1250-NEXT: scratch_store_b128 v0, v[30:33], off offset:112 +; GFX1250-NEXT: scratch_store_b128 v0, v[26:29], off offset:96 +; GFX1250-NEXT: scratch_store_b16 v0, v1, off offset:128 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %ins.0 = insertvalue { <32 x i32>, bfloat } poison, <32 x i32> %b, 0 %ins.1 = insertvalue { <32 x i32>, bfloat } %ins.0 ,bfloat %a, 1 ret { <32 x i32>, bfloat } %ins.1 @@ -6172,6 +6796,16 @@ define <2 x float> @global_extload_v2bf16_to_v2f32(ptr addrspace(1) %ptr) { ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: global_extload_v2bf16_to_v2f32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v[0:1], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %load = load <2 x bfloat>, ptr addrspace(1) %ptr %fpext = fpext <2 x bfloat> %load to <2 x float> ret <2 x float> %fpext @@ -6255,6 +6889,17 @@ define <3 x float> @global_extload_v3bf16_to_v3f32(ptr addrspace(1) %ptr) { ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: global_extload_v3bf16_to_v3f32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b64 v[2:3], v[0:1], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX1250-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %load = load <3 x bfloat>, ptr addrspace(1) %ptr %fpext = fpext <3 x bfloat> %load to <3 x float> ret <3 x float> %fpext @@ -6334,6 +6979,18 @@ define <4 x float> @global_extload_v4bf16_to_v4f32(ptr addrspace(1) %ptr) { ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: global_extload_v4bf16_to_v4f32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b64 v[2:3], v[0:1], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX1250-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX1250-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %load = load <4 x bfloat>, ptr addrspace(1) %ptr %fpext = fpext <4 x bfloat> %load to <4 x float> ret <4 x float> %fpext @@ -6423,6 +7080,19 @@ define <5 x float> @global_extload_v5bf16_to_v5f32(ptr addrspace(1) %ptr) { ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: global_extload_v5bf16_to_v5f32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b128 v[2:5], v[0:1], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX1250-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX1250-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX1250-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %load = load <5 x bfloat>, ptr addrspace(1) %ptr %fpext = fpext <5 x bfloat> %load to <5 x float> ret <5 x float> %fpext @@ -6527,6 +7197,19 @@ define <6 x float> @global_extload_v6bf16_to_v6f32(ptr addrspace(1) %ptr) { ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: global_extload_v6bf16_to_v6f32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b96 v[4:6], v[0:1], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v4 :: v_dual_lshlrev_b32 v2, 16, v5 +; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX1250-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX1250-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %load = load <6 x bfloat>, ptr addrspace(1) %ptr %fpext = fpext <6 x bfloat> %load to <6 x float> ret <6 x float> %fpext @@ -6630,6 +7313,21 @@ define <8 x float> @global_extload_v8bf16_to_v8f32(ptr addrspace(1) %ptr) { ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v7 ; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: global_extload_v8bf16_to_v8f32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b128 v[4:7], v[0:1], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v4 :: v_dual_lshlrev_b32 v2, 16, v5 +; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX1250-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX1250-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX1250-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; GFX1250-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %load = load <8 x bfloat>, ptr addrspace(1) %ptr %fpext = fpext <8 x bfloat> %load to <8 x float> ret <8 x float> %fpext @@ -6797,6 +7495,32 @@ define <16 x float> @global_extload_v16bf16_to_v16f32(ptr addrspace(1) %ptr) { ; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v15 ; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: global_extload_v16bf16_to_v16f32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_load_b128 v[4:7], v[0:1], off +; GFX1250-NEXT: global_load_b128 v[12:15], v[0:1], off offset:16 +; GFX1250-NEXT: s_wait_loadcnt 0x1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v4 :: v_dual_lshlrev_b32 v2, 16, v5 +; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX1250-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX1250-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX1250-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; GFX1250-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v8, 16, v12 :: v_dual_lshlrev_b32 v10, 16, v13 +; GFX1250-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 +; GFX1250-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 +; GFX1250-NEXT: v_lshlrev_b32_e32 v12, 16, v14 +; GFX1250-NEXT: v_and_b32_e32 v13, 0xffff0000, v14 +; GFX1250-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; GFX1250-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %load = load <16 x bfloat>, ptr addrspace(1) %ptr %fpext = fpext <16 x bfloat> %load to <16 x float> ret <16 x float> %fpext @@ -7088,6 +7812,50 @@ define <32 x float> @global_extload_v32bf16_to_v32f32(ptr addrspace(1) %ptr) { ; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v31 ; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: global_extload_v32bf16_to_v32f32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_clause 0x3 +; GFX1250-NEXT: global_load_b128 v[4:7], v[0:1], off +; GFX1250-NEXT: global_load_b128 v[12:15], v[0:1], off offset:16 +; GFX1250-NEXT: global_load_b128 v[20:23], v[0:1], off offset:32 +; GFX1250-NEXT: global_load_b128 v[28:31], v[0:1], off offset:48 +; GFX1250-NEXT: s_wait_loadcnt 0x3 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v4 :: v_dual_lshlrev_b32 v2, 16, v5 +; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX1250-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX1250-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX1250-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; GFX1250-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX1250-NEXT: s_wait_loadcnt 0x2 +; GFX1250-NEXT: v_dual_lshlrev_b32 v8, 16, v12 :: v_dual_lshlrev_b32 v10, 16, v13 +; GFX1250-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 +; GFX1250-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 +; GFX1250-NEXT: v_lshlrev_b32_e32 v12, 16, v14 +; GFX1250-NEXT: v_and_b32_e32 v13, 0xffff0000, v14 +; GFX1250-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; GFX1250-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX1250-NEXT: s_wait_loadcnt 0x1 +; GFX1250-NEXT: v_dual_lshlrev_b32 v16, 16, v20 :: v_dual_lshlrev_b32 v18, 16, v21 +; GFX1250-NEXT: v_and_b32_e32 v17, 0xffff0000, v20 +; GFX1250-NEXT: v_and_b32_e32 v19, 0xffff0000, v21 +; GFX1250-NEXT: v_lshlrev_b32_e32 v20, 16, v22 +; GFX1250-NEXT: v_and_b32_e32 v21, 0xffff0000, v22 +; GFX1250-NEXT: v_lshlrev_b32_e32 v22, 16, v23 +; GFX1250-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v24, 16, v28 :: v_dual_lshlrev_b32 v26, 16, v29 +; GFX1250-NEXT: v_and_b32_e32 v25, 0xffff0000, v28 +; GFX1250-NEXT: v_and_b32_e32 v27, 0xffff0000, v29 +; GFX1250-NEXT: v_lshlrev_b32_e32 v28, 16, v30 +; GFX1250-NEXT: v_and_b32_e32 v29, 0xffff0000, v30 +; GFX1250-NEXT: v_lshlrev_b32_e32 v30, 16, v31 +; GFX1250-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %load = load <32 x bfloat>, ptr addrspace(1) %ptr %fpext = fpext <32 x bfloat> %load to <32 x float> ret <32 x float> %fpext @@ -7179,6 +7947,19 @@ define <2 x double> @global_extload_v2bf16_to_v2f64(ptr addrspace(1) %ptr) { ; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: global_extload_v2bf16_to_v2f64: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v0, v[0:1], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %load = load <2 x bfloat>, ptr addrspace(1) %ptr %fpext = fpext <2 x bfloat> %load to <2 x double> ret <2 x double> %fpext @@ -7285,6 +8066,22 @@ define <3 x double> @global_extload_v3bf16_to_v3f64(ptr addrspace(1) %ptr) { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: global_extload_v3bf16_to_v3f64: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v2, 16, v0 :: v_dual_lshlrev_b32 v4, 16, v1 +; GFX1250-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %load = load <3 x bfloat>, ptr addrspace(1) %ptr %fpext = fpext <3 x bfloat> %load to <3 x double> ret <3 x double> %fpext @@ -7390,6 +8187,23 @@ define <4 x double> @global_extload_v4bf16_to_v4f64(ptr addrspace(1) %ptr) { ; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: global_extload_v4bf16_to_v4f64: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b64 v[2:3], v[0:1], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v2 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX1250-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %load = load <4 x bfloat>, ptr addrspace(1) %ptr %fpext = fpext <4 x bfloat> %load to <4 x double> ret <4 x double> %fpext @@ -7509,6 +8323,24 @@ define <5 x double> @global_extload_v5bf16_to_v5f64(ptr addrspace(1) %ptr) { ; GFX11-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: global_extload_v5bf16_to_v5f64: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b128 v[2:5], v[0:1], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v2 :: v_dual_lshlrev_b32 v5, 16, v3 +; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX1250-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX1250-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %load = load <5 x bfloat>, ptr addrspace(1) %ptr %fpext = fpext <5 x bfloat> %load to <5 x double> ret <5 x double> %fpext @@ -7636,6 +8468,26 @@ define <6 x double> @global_extload_v6bf16_to_v6f64(ptr addrspace(1) %ptr) { ; GFX11-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: global_extload_v6bf16_to_v6f64: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b96 v[4:6], v[0:1], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 +; GFX1250-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX1250-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX1250-NEXT: v_lshlrev_b32_e32 v8, 16, v6 +; GFX1250-NEXT: v_and_b32_e32 v10, 0xffff0000, v6 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %load = load <6 x bfloat>, ptr addrspace(1) %ptr %fpext = fpext <6 x bfloat> %load to <6 x double> ret <6 x double> %fpext @@ -7787,6 +8639,28 @@ define <8 x double> @global_extload_v8bf16_to_v8f64(ptr addrspace(1) %ptr) { ; GFX11-NEXT: v_cvt_f64_f32_e32 v[12:13], v12 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[14:15], v14 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: global_extload_v8bf16_to_v8f64: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b128 v[8:11], v[0:1], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v8 :: v_dual_lshlrev_b32 v4, 16, v9 +; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff0000, v8 +; GFX1250-NEXT: v_and_b32_e32 v6, 0xffff0000, v9 +; GFX1250-NEXT: v_dual_lshlrev_b32 v8, 16, v10 :: v_dual_lshlrev_b32 v12, 16, v11 +; GFX1250-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX1250-NEXT: v_and_b32_e32 v14, 0xffff0000, v11 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[12:13], v12 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[14:15], v14 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %load = load <8 x bfloat>, ptr addrspace(1) %ptr %fpext = fpext <8 x bfloat> %load to <8 x double> ret <8 x double> %fpext @@ -8050,6 +8924,46 @@ define <16 x double> @global_extload_v16bf16_to_v16f64(ptr addrspace(1) %ptr) { ; GFX11-NEXT: v_cvt_f64_f32_e32 v[28:29], v28 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[30:31], v30 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: global_extload_v16bf16_to_v16f64: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_load_b128 v[8:11], v[0:1], off +; GFX1250-NEXT: global_load_b128 v[24:27], v[0:1], off offset:16 +; GFX1250-NEXT: s_wait_loadcnt 0x1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v8 :: v_dual_lshlrev_b32 v4, 16, v9 +; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff0000, v8 +; GFX1250-NEXT: v_and_b32_e32 v6, 0xffff0000, v9 +; GFX1250-NEXT: v_dual_lshlrev_b32 v8, 16, v10 :: v_dual_lshlrev_b32 v12, 16, v11 +; GFX1250-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX1250-NEXT: v_and_b32_e32 v14, 0xffff0000, v11 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v16, 16, v24 :: v_dual_lshlrev_b32 v20, 16, v25 +; GFX1250-NEXT: v_and_b32_e32 v18, 0xffff0000, v24 +; GFX1250-NEXT: v_and_b32_e32 v22, 0xffff0000, v25 +; GFX1250-NEXT: v_dual_lshlrev_b32 v24, 16, v26 :: v_dual_lshlrev_b32 v28, 16, v27 +; GFX1250-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX1250-NEXT: v_and_b32_e32 v30, 0xffff0000, v27 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[12:13], v12 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[14:15], v14 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[16:17], v16 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[18:19], v18 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[20:21], v20 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[22:23], v22 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[24:25], v24 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[26:27], v26 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[28:29], v28 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[30:31], v30 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %load = load <16 x bfloat>, ptr addrspace(1) %ptr %fpext = fpext <16 x bfloat> %load to <16 x double> ret <16 x double> %fpext @@ -9570,6 +10484,131 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 ; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: global_extload_v32bf16_to_v32f64: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-NEXT: s_clause 0x1f +; GFX1250-NEXT: global_load_u16 v1, v[2:3], off offset:2 +; GFX1250-NEXT: global_load_u16 v4, v[2:3], off offset:12 +; GFX1250-NEXT: global_load_u16 v5, v[2:3], off offset:8 +; GFX1250-NEXT: global_load_u16 v6, v[2:3], off offset:4 +; GFX1250-NEXT: global_load_u16 v7, v[2:3], off +; GFX1250-NEXT: global_load_u16 v8, v[2:3], off offset:6 +; GFX1250-NEXT: global_load_u16 v9, v[2:3], off offset:10 +; GFX1250-NEXT: global_load_u16 v10, v[2:3], off offset:14 +; GFX1250-NEXT: global_load_u16 v11, v[2:3], off offset:18 +; GFX1250-NEXT: global_load_u16 v12, v[2:3], off offset:62 +; GFX1250-NEXT: global_load_u16 v13, v[2:3], off offset:60 +; GFX1250-NEXT: global_load_u16 v14, v[2:3], off offset:58 +; GFX1250-NEXT: global_load_u16 v15, v[2:3], off offset:56 +; GFX1250-NEXT: global_load_u16 v16, v[2:3], off offset:28 +; GFX1250-NEXT: global_load_u16 v17, v[2:3], off offset:24 +; GFX1250-NEXT: global_load_u16 v18, v[2:3], off offset:20 +; GFX1250-NEXT: global_load_u16 v19, v[2:3], off offset:16 +; GFX1250-NEXT: global_load_u16 v20, v[2:3], off offset:22 +; GFX1250-NEXT: global_load_u16 v21, v[2:3], off offset:26 +; GFX1250-NEXT: global_load_u16 v22, v[2:3], off offset:30 +; GFX1250-NEXT: global_load_u16 v23, v[2:3], off offset:34 +; GFX1250-NEXT: global_load_u16 v24, v[2:3], off offset:44 +; GFX1250-NEXT: global_load_u16 v25, v[2:3], off offset:40 +; GFX1250-NEXT: global_load_u16 v26, v[2:3], off offset:36 +; GFX1250-NEXT: global_load_u16 v27, v[2:3], off offset:32 +; GFX1250-NEXT: global_load_u16 v28, v[2:3], off offset:38 +; GFX1250-NEXT: global_load_u16 v29, v[2:3], off offset:42 +; GFX1250-NEXT: global_load_u16 v30, v[2:3], off offset:46 +; GFX1250-NEXT: global_load_u16 v31, v[2:3], off offset:50 +; GFX1250-NEXT: global_load_u16 v32, v[2:3], off offset:52 +; GFX1250-NEXT: global_load_u16 v33, v[2:3], off offset:48 +; GFX1250-NEXT: global_load_u16 v34, v[2:3], off offset:54 +; GFX1250-NEXT: s_wait_loadcnt 0x1e +; GFX1250-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v37, 16, v4 +; GFX1250-NEXT: s_wait_loadcnt 0x1c +; GFX1250-NEXT: v_dual_lshlrev_b32 v81, 16, v5 :: v_dual_lshlrev_b32 v85, 16, v6 +; GFX1250-NEXT: s_wait_loadcnt 0x1a +; GFX1250-NEXT: v_dual_lshlrev_b32 v84, 16, v7 :: v_dual_lshlrev_b32 v35, 16, v8 +; GFX1250-NEXT: s_wait_loadcnt 0x18 +; GFX1250-NEXT: v_dual_lshlrev_b32 v80, 16, v9 :: v_dual_lshlrev_b32 v36, 16, v10 +; GFX1250-NEXT: s_wait_loadcnt 0x15 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v2, 16, v12 :: v_dual_lshlrev_b32 v3, 16, v13 +; GFX1250-NEXT: s_wait_loadcnt 0x14 +; GFX1250-NEXT: v_dual_lshlrev_b32 v12, 16, v11 :: v_dual_lshlrev_b32 v6, 16, v14 +; GFX1250-NEXT: s_wait_loadcnt 0x13 +; GFX1250-NEXT: v_lshlrev_b32_e32 v7, 16, v15 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[4:5], v2 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 +; GFX1250-NEXT: s_wait_loadcnt 0x11 +; GFX1250-NEXT: v_dual_lshlrev_b32 v68, 16, v17 :: v_dual_lshlrev_b32 v39, 16, v16 +; GFX1250-NEXT: s_wait_loadcnt 0xe +; GFX1250-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[8:9], v6 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 +; GFX1250-NEXT: s_wait_loadcnt 0xc +; GFX1250-NEXT: v_dual_lshlrev_b32 v21, 16, v21 :: v_dual_lshlrev_b32 v38, 16, v22 +; GFX1250-NEXT: s_wait_loadcnt 0x9 +; GFX1250-NEXT: v_dual_lshlrev_b32 v48, 16, v23 :: v_dual_lshlrev_b32 v25, 16, v25 +; GFX1250-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX1250-NEXT: s_wait_loadcnt 0x5 +; GFX1250-NEXT: v_dual_lshlrev_b32 v49, 16, v28 :: v_dual_lshlrev_b32 v64, 16, v29 +; GFX1250-NEXT: s_wait_loadcnt 0x3 +; GFX1250-NEXT: v_dual_lshlrev_b32 v50, 16, v30 :: v_dual_lshlrev_b32 v51, 16, v31 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v33, 16, v33 :: v_dual_lshlrev_b32 v52, 16, v34 +; GFX1250-NEXT: v_dual_lshlrev_b32 v32, 16, v32 :: v_dual_lshlrev_b32 v69, 16, v27 +; GFX1250-NEXT: v_lshlrev_b32_e32 v70, 16, v26 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[14:15], v35 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[54:55], v52 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[52:53], v32 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[30:31], v38 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[28:29], v39 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[34:35], v48 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[38:39], v49 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[48:49], v33 +; GFX1250-NEXT: v_dual_lshlrev_b32 v13, 16, v19 :: v_dual_lshlrev_b32 v82, 16, v18 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[66:67], v64 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[64:65], v25 +; GFX1250-NEXT: scratch_store_b128 v0, v[2:5], off offset:240 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[4:5], v50 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[50:51], v51 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[2:3], v24 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[18:19], v36 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[16:17], v37 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[36:37], v70 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[32:33], v69 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[70:71], v21 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[68:69], v68 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[26:27], v20 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[24:25], v82 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[22:23], v12 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[20:21], v13 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[82:83], v80 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[80:81], v81 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[12:13], v85 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[10:11], v1 +; GFX1250-NEXT: scratch_store_b128 v0, v[6:9], off offset:224 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[8:9], v84 +; GFX1250-NEXT: s_clause 0xd +; GFX1250-NEXT: scratch_store_b128 v0, v[52:55], off offset:208 +; GFX1250-NEXT: scratch_store_b128 v0, v[48:51], off offset:192 +; GFX1250-NEXT: scratch_store_b128 v0, v[2:5], off offset:176 +; GFX1250-NEXT: scratch_store_b128 v0, v[64:67], off offset:160 +; GFX1250-NEXT: scratch_store_b128 v0, v[36:39], off offset:144 +; GFX1250-NEXT: scratch_store_b128 v0, v[32:35], off offset:128 +; GFX1250-NEXT: scratch_store_b128 v0, v[28:31], off offset:112 +; GFX1250-NEXT: scratch_store_b128 v0, v[68:71], off offset:96 +; GFX1250-NEXT: scratch_store_b128 v0, v[24:27], off offset:80 +; GFX1250-NEXT: scratch_store_b128 v0, v[20:23], off offset:64 +; GFX1250-NEXT: scratch_store_b128 v0, v[16:19], off offset:48 +; GFX1250-NEXT: scratch_store_b128 v0, v[80:83], off offset:32 +; GFX1250-NEXT: scratch_store_b128 v0, v[12:15], off offset:16 +; GFX1250-NEXT: scratch_store_b128 v0, v[8:11], off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %load = load <32 x bfloat>, ptr addrspace(1) %ptr %fpext = fpext <32 x bfloat> %load to <32 x double> ret <32 x double> %fpext @@ -9686,6 +10725,16 @@ define bfloat @v_fadd_bf16(bfloat %a, bfloat %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fadd_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fadd bfloat %a, %b ret bfloat %op } @@ -9859,6 +10908,13 @@ define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fadd_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_add_bf16 v0, v0, v1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fadd <2 x bfloat> %a, %b ret <2 x bfloat> %op } @@ -10093,6 +11149,14 @@ define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fadd_v3bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_add_bf16 v0, v0, v2 +; GFX1250-NEXT: v_pk_add_bf16 v1, v1, v3 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fadd <3 x bfloat> %a, %b ret <3 x bfloat> %op } @@ -10383,6 +11447,14 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fadd_v4bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_add_bf16 v0, v0, v2 +; GFX1250-NEXT: v_pk_add_bf16 v1, v1, v3 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fadd <4 x bfloat> %a, %b ret <4 x bfloat> %op } @@ -10921,6 +11993,16 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v3, v3, v8, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fadd_v8bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_add_bf16 v0, v0, v4 +; GFX1250-NEXT: v_pk_add_bf16 v1, v1, v5 +; GFX1250-NEXT: v_pk_add_bf16 v2, v2, v6 +; GFX1250-NEXT: v_pk_add_bf16 v3, v3, v7 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fadd <8 x bfloat> %a, %b ret <8 x bfloat> %op } @@ -11951,6 +13033,20 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v4, v4, v13, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fadd_v16bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_add_bf16 v0, v0, v8 +; GFX1250-NEXT: v_pk_add_bf16 v1, v1, v9 +; GFX1250-NEXT: v_pk_add_bf16 v2, v2, v10 +; GFX1250-NEXT: v_pk_add_bf16 v3, v3, v11 +; GFX1250-NEXT: v_pk_add_bf16 v4, v4, v12 +; GFX1250-NEXT: v_pk_add_bf16 v5, v5, v13 +; GFX1250-NEXT: v_pk_add_bf16 v6, v6, v14 +; GFX1250-NEXT: v_pk_add_bf16 v7, v7, v15 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fadd <16 x bfloat> %a, %b ret <16 x bfloat> %op } @@ -14043,6 +15139,30 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v15, v15, v17, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fadd_v32bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: scratch_load_b32 v31, off, s32 +; GFX1250-NEXT: v_pk_add_bf16 v0, v0, v16 +; GFX1250-NEXT: v_pk_add_bf16 v1, v1, v17 +; GFX1250-NEXT: v_pk_add_bf16 v2, v2, v18 +; GFX1250-NEXT: v_pk_add_bf16 v3, v3, v19 +; GFX1250-NEXT: v_pk_add_bf16 v4, v4, v20 +; GFX1250-NEXT: v_pk_add_bf16 v5, v5, v21 +; GFX1250-NEXT: v_pk_add_bf16 v6, v6, v22 +; GFX1250-NEXT: v_pk_add_bf16 v7, v7, v23 +; GFX1250-NEXT: v_pk_add_bf16 v8, v8, v24 +; GFX1250-NEXT: v_pk_add_bf16 v9, v9, v25 +; GFX1250-NEXT: v_pk_add_bf16 v10, v10, v26 +; GFX1250-NEXT: v_pk_add_bf16 v11, v11, v27 +; GFX1250-NEXT: v_pk_add_bf16 v12, v12, v28 +; GFX1250-NEXT: v_pk_add_bf16 v13, v13, v29 +; GFX1250-NEXT: v_pk_add_bf16 v14, v14, v30 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_pk_add_bf16 v15, v15, v31 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fadd <32 x bfloat> %a, %b ret <32 x bfloat> %op } @@ -14147,6 +15267,16 @@ define bfloat @v_fadd_bf16_fpimm_0(bfloat %arg0) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fadd_bf16_fpimm_0: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %add = fadd bfloat %arg0, 1.0 ret bfloat %add } @@ -14251,6 +15381,16 @@ define bfloat @v_fadd_bf16_fpimm_1(bfloat %arg0) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fadd_bf16_fpimm_1: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_add_f32_e32 v0, 0x42280000, v0 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %add = fadd bfloat %arg0, 42.0 ret bfloat %add } @@ -14366,6 +15506,16 @@ define bfloat @v_fsub_bf16(bfloat %a, bfloat %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fsub_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fsub bfloat %a, %b ret bfloat %op } @@ -14539,6 +15689,13 @@ define <2 x bfloat> @v_fsub_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fsub_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_add_bf16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fsub <2 x bfloat> %a, %b ret <2 x bfloat> %op } @@ -14773,6 +15930,22 @@ define <3 x bfloat> @v_fsub_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fsub_v3bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX1250-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v2, 16, v2 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_sub_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX1250-NEXT: v_dual_sub_f32 v0, v0, v2 :: v_dual_sub_f32 v1, v1, v3 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v4 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fsub <3 x bfloat> %a, %b ret <3 x bfloat> %op } @@ -15063,6 +16236,25 @@ define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fsub_v4bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX1250-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX1250-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX1250-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v2, 16, v2 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX1250-NEXT: v_dual_sub_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX1250-NEXT: v_dual_sub_f32 v0, v0, v2 :: v_dual_sub_f32 v1, v1, v3 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v5 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, v4 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fsub <4 x bfloat> %a, %b ret <4 x bfloat> %op } @@ -15178,6 +16370,13 @@ define bfloat @v_fmul_bf16(bfloat %a, bfloat %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fmul_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v0, v1, 0 op_sel_hi:[1,1,0] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fmul bfloat %a, %b ret bfloat %op } @@ -15351,6 +16550,13 @@ define <2 x bfloat> @v_fmul_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fmul_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_mul_bf16 v0, v0, v1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fmul <2 x bfloat> %a, %b ret <2 x bfloat> %op } @@ -15585,6 +16791,14 @@ define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fmul_v3bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_mul_bf16 v0, v0, v2 +; GFX1250-NEXT: v_pk_mul_bf16 v1, v1, v3 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fmul <3 x bfloat> %a, %b ret <3 x bfloat> %op } @@ -15875,6 +17089,14 @@ define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fmul_v4bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_mul_bf16 v0, v0, v2 +; GFX1250-NEXT: v_pk_mul_bf16 v1, v1, v3 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fmul <4 x bfloat> %a, %b ret <4 x bfloat> %op } @@ -16413,6 +17635,16 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v3, v3, v8, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fmul_v8bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_mul_bf16 v0, v0, v4 +; GFX1250-NEXT: v_pk_mul_bf16 v1, v1, v5 +; GFX1250-NEXT: v_pk_mul_bf16 v2, v2, v6 +; GFX1250-NEXT: v_pk_mul_bf16 v3, v3, v7 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fmul <8 x bfloat> %a, %b ret <8 x bfloat> %op } @@ -17443,6 +18675,20 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v4, v4, v13, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fmul_v16bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_mul_bf16 v0, v0, v8 +; GFX1250-NEXT: v_pk_mul_bf16 v1, v1, v9 +; GFX1250-NEXT: v_pk_mul_bf16 v2, v2, v10 +; GFX1250-NEXT: v_pk_mul_bf16 v3, v3, v11 +; GFX1250-NEXT: v_pk_mul_bf16 v4, v4, v12 +; GFX1250-NEXT: v_pk_mul_bf16 v5, v5, v13 +; GFX1250-NEXT: v_pk_mul_bf16 v6, v6, v14 +; GFX1250-NEXT: v_pk_mul_bf16 v7, v7, v15 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fmul <16 x bfloat> %a, %b ret <16 x bfloat> %op } @@ -19535,6 +20781,30 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v15, v15, v17, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fmul_v32bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: scratch_load_b32 v31, off, s32 +; GFX1250-NEXT: v_pk_mul_bf16 v0, v0, v16 +; GFX1250-NEXT: v_pk_mul_bf16 v1, v1, v17 +; GFX1250-NEXT: v_pk_mul_bf16 v2, v2, v18 +; GFX1250-NEXT: v_pk_mul_bf16 v3, v3, v19 +; GFX1250-NEXT: v_pk_mul_bf16 v4, v4, v20 +; GFX1250-NEXT: v_pk_mul_bf16 v5, v5, v21 +; GFX1250-NEXT: v_pk_mul_bf16 v6, v6, v22 +; GFX1250-NEXT: v_pk_mul_bf16 v7, v7, v23 +; GFX1250-NEXT: v_pk_mul_bf16 v8, v8, v24 +; GFX1250-NEXT: v_pk_mul_bf16 v9, v9, v25 +; GFX1250-NEXT: v_pk_mul_bf16 v10, v10, v26 +; GFX1250-NEXT: v_pk_mul_bf16 v11, v11, v27 +; GFX1250-NEXT: v_pk_mul_bf16 v12, v12, v28 +; GFX1250-NEXT: v_pk_mul_bf16 v13, v13, v29 +; GFX1250-NEXT: v_pk_mul_bf16 v14, v14, v30 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_pk_mul_bf16 v15, v15, v31 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fmul <32 x bfloat> %a, %b ret <32 x bfloat> %op } @@ -19741,6 +21011,32 @@ define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fdiv_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_div_scale_f32 v2, null, v1, v1, v0 +; GFX1250-NEXT: v_rcp_f32_e32 v3, v2 +; GFX1250-NEXT: v_nop +; GFX1250-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_fma_f32 v4, -v2, v3, 1.0 +; GFX1250-NEXT: v_fmac_f32_e32 v3, v4, v3 +; GFX1250-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX1250-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_fmac_f32_e32 v5, v6, v3 +; GFX1250-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX1250-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fdiv bfloat %a, %b ret bfloat %op } @@ -19795,6 +21091,13 @@ define bfloat @v_fabs_bf16(bfloat %a) { ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fabs_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call bfloat @llvm.fabs.bf16(bfloat %a) ret bfloat %op } @@ -19838,6 +21141,13 @@ define amdgpu_ps i32 @s_fabs_bf16(bfloat inreg %a) { ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX11-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: s_fabs_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_and_b32 s0, s0, 0x7fff +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX1250-NEXT: ; return to shader part epilog %op = call bfloat @llvm.fabs.bf16(bfloat %a) %cast = bitcast bfloat %op to i16 %zext = zext i16 %cast to i32 @@ -19887,6 +21197,13 @@ define bfloat @v_fneg_bf16(bfloat %a) { ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fneg_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fneg bfloat %a ret bfloat %op } @@ -19933,6 +21250,13 @@ define amdgpu_ps i32 @s_fneg_bf16(bfloat inreg %a) { ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX11-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: s_fneg_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_xor_b32 s0, s0, 0x8000 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX1250-NEXT: ; return to shader part epilog %op = fneg bfloat %a %cast = bitcast bfloat %op to i16 %zext = zext i16 %cast to i32 @@ -19992,6 +21316,13 @@ define bfloat @v_fneg_fabs_bf16(bfloat %a) { ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11FAKE16-NEXT: v_or_b32_e32 v0, 0x8000, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fneg_fabs_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_or_b32_e32 v0, 0x8000, v0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %fabs = call bfloat @llvm.fabs.bf16(bfloat %a) %op = fneg bfloat %fabs ret bfloat %op @@ -20045,6 +21376,13 @@ define amdgpu_ps i32 @s_fneg_fabs_bf16(bfloat inreg %a) { ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX11-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: s_fneg_fabs_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_bitset1_b32 s0, 15 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX1250-NEXT: ; return to shader part epilog %fabs = call bfloat @llvm.fabs.bf16(bfloat %a) %op = fneg bfloat %fabs %cast = bitcast bfloat %op to i16 @@ -20172,6 +21510,16 @@ define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_minnum_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_min_num_f32_e32 v0, v0, v1 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call bfloat @llvm.minnum.bf16(bfloat %a, bfloat %b) ret bfloat %op } @@ -20345,6 +21693,13 @@ define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_minnum_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_min_num_bf16 v0, v0, v1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) ret <2 x bfloat> %op } @@ -20579,6 +21934,14 @@ define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_minnum_v3bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_min_num_bf16 v0, v0, v2 +; GFX1250-NEXT: v_pk_min_num_bf16 v1, v1, v3 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call <3 x bfloat> @llvm.minnum.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) ret <3 x bfloat> %op } @@ -20869,6 +22232,14 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_minnum_v4bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_min_num_bf16 v0, v0, v2 +; GFX1250-NEXT: v_pk_min_num_bf16 v1, v1, v3 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) ret <4 x bfloat> %op } @@ -21407,6 +22778,16 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v3, v3, v8, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_minnum_v8bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_min_num_bf16 v0, v0, v4 +; GFX1250-NEXT: v_pk_min_num_bf16 v1, v1, v5 +; GFX1250-NEXT: v_pk_min_num_bf16 v2, v2, v6 +; GFX1250-NEXT: v_pk_min_num_bf16 v3, v3, v7 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call <8 x bfloat> @llvm.minnum.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) ret <8 x bfloat> %op } @@ -22437,6 +23818,20 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v4, v4, v13, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_minnum_v16bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_min_num_bf16 v0, v0, v8 +; GFX1250-NEXT: v_pk_min_num_bf16 v1, v1, v9 +; GFX1250-NEXT: v_pk_min_num_bf16 v2, v2, v10 +; GFX1250-NEXT: v_pk_min_num_bf16 v3, v3, v11 +; GFX1250-NEXT: v_pk_min_num_bf16 v4, v4, v12 +; GFX1250-NEXT: v_pk_min_num_bf16 v5, v5, v13 +; GFX1250-NEXT: v_pk_min_num_bf16 v6, v6, v14 +; GFX1250-NEXT: v_pk_min_num_bf16 v7, v7, v15 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call <16 x bfloat> @llvm.minnum.v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) ret <16 x bfloat> %op } @@ -24529,6 +25924,30 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v15, v15, v17, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_minnum_v32bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: scratch_load_b32 v31, off, s32 +; GFX1250-NEXT: v_pk_min_num_bf16 v0, v0, v16 +; GFX1250-NEXT: v_pk_min_num_bf16 v1, v1, v17 +; GFX1250-NEXT: v_pk_min_num_bf16 v2, v2, v18 +; GFX1250-NEXT: v_pk_min_num_bf16 v3, v3, v19 +; GFX1250-NEXT: v_pk_min_num_bf16 v4, v4, v20 +; GFX1250-NEXT: v_pk_min_num_bf16 v5, v5, v21 +; GFX1250-NEXT: v_pk_min_num_bf16 v6, v6, v22 +; GFX1250-NEXT: v_pk_min_num_bf16 v7, v7, v23 +; GFX1250-NEXT: v_pk_min_num_bf16 v8, v8, v24 +; GFX1250-NEXT: v_pk_min_num_bf16 v9, v9, v25 +; GFX1250-NEXT: v_pk_min_num_bf16 v10, v10, v26 +; GFX1250-NEXT: v_pk_min_num_bf16 v11, v11, v27 +; GFX1250-NEXT: v_pk_min_num_bf16 v12, v12, v28 +; GFX1250-NEXT: v_pk_min_num_bf16 v13, v13, v29 +; GFX1250-NEXT: v_pk_min_num_bf16 v14, v14, v30 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_pk_min_num_bf16 v15, v15, v31 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call <32 x bfloat> @llvm.minnum.v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) ret <32 x bfloat> %op } @@ -24653,6 +26072,16 @@ define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_maxnum_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_max_num_f32_e32 v0, v0, v1 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call bfloat @llvm.maxnum.bf16(bfloat %a, bfloat %b) ret bfloat %op } @@ -24826,6 +26255,13 @@ define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_maxnum_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_max_num_bf16 v0, v0, v1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) ret <2 x bfloat> %op } @@ -25060,6 +26496,14 @@ define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_maxnum_v3bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_max_num_bf16 v0, v0, v2 +; GFX1250-NEXT: v_pk_max_num_bf16 v1, v1, v3 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call <3 x bfloat> @llvm.maxnum.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) ret <3 x bfloat> %op } @@ -25350,6 +26794,14 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_maxnum_v4bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_max_num_bf16 v0, v0, v2 +; GFX1250-NEXT: v_pk_max_num_bf16 v1, v1, v3 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) ret <4 x bfloat> %op } @@ -25888,6 +27340,16 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v3, v3, v8, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_maxnum_v8bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_max_num_bf16 v0, v0, v4 +; GFX1250-NEXT: v_pk_max_num_bf16 v1, v1, v5 +; GFX1250-NEXT: v_pk_max_num_bf16 v2, v2, v6 +; GFX1250-NEXT: v_pk_max_num_bf16 v3, v3, v7 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call <8 x bfloat> @llvm.maxnum.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) ret <8 x bfloat> %op } @@ -26918,6 +28380,20 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v4, v4, v13, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_maxnum_v16bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_max_num_bf16 v0, v0, v8 +; GFX1250-NEXT: v_pk_max_num_bf16 v1, v1, v9 +; GFX1250-NEXT: v_pk_max_num_bf16 v2, v2, v10 +; GFX1250-NEXT: v_pk_max_num_bf16 v3, v3, v11 +; GFX1250-NEXT: v_pk_max_num_bf16 v4, v4, v12 +; GFX1250-NEXT: v_pk_max_num_bf16 v5, v5, v13 +; GFX1250-NEXT: v_pk_max_num_bf16 v6, v6, v14 +; GFX1250-NEXT: v_pk_max_num_bf16 v7, v7, v15 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call <16 x bfloat> @llvm.maxnum.v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) ret <16 x bfloat> %op } @@ -29010,6 +30486,30 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v15, v15, v17, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_maxnum_v32bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: scratch_load_b32 v31, off, s32 +; GFX1250-NEXT: v_pk_max_num_bf16 v0, v0, v16 +; GFX1250-NEXT: v_pk_max_num_bf16 v1, v1, v17 +; GFX1250-NEXT: v_pk_max_num_bf16 v2, v2, v18 +; GFX1250-NEXT: v_pk_max_num_bf16 v3, v3, v19 +; GFX1250-NEXT: v_pk_max_num_bf16 v4, v4, v20 +; GFX1250-NEXT: v_pk_max_num_bf16 v5, v5, v21 +; GFX1250-NEXT: v_pk_max_num_bf16 v6, v6, v22 +; GFX1250-NEXT: v_pk_max_num_bf16 v7, v7, v23 +; GFX1250-NEXT: v_pk_max_num_bf16 v8, v8, v24 +; GFX1250-NEXT: v_pk_max_num_bf16 v9, v9, v25 +; GFX1250-NEXT: v_pk_max_num_bf16 v10, v10, v26 +; GFX1250-NEXT: v_pk_max_num_bf16 v11, v11, v27 +; GFX1250-NEXT: v_pk_max_num_bf16 v12, v12, v28 +; GFX1250-NEXT: v_pk_max_num_bf16 v13, v13, v29 +; GFX1250-NEXT: v_pk_max_num_bf16 v14, v14, v30 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_pk_max_num_bf16 v15, v15, v31 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call <32 x bfloat> @llvm.maxnum.v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) ret <32 x bfloat> %op } @@ -29263,6 +30763,13 @@ define bfloat @v_sqrt_bf16(bfloat %a) { ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_sqrt_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_sqrt_bf16_e32 v0, v0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call bfloat @llvm.sqrt.bf16(bfloat %a) ret bfloat %op } @@ -29369,6 +30876,16 @@ define bfloat @v_ldexp_bf16_i32(bfloat %a, i32 %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_ldexp_bf16_i32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call bfloat @llvm.ldexp.bf16.i32(bfloat %a, i32 %b) ret bfloat %op } @@ -29487,6 +31004,17 @@ define { bfloat, i16 } @v_frexp_bf16_i16(bfloat %a) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_frexp_bf16_i16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_frexp_mant_f32_e32 v0, v1 +; GFX1250-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call { bfloat, i16 } @llvm.frexp.bf16.i16(bfloat %a) ret { bfloat, i16 } %op } @@ -29725,6 +31253,32 @@ define bfloat @v_log_bf16(bfloat %a) { ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_log_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1250-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo +; GFX1250-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) +; GFX1250-NEXT: v_log_f32_e32 v0, v0 +; GFX1250-NEXT: v_nop +; GFX1250-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 +; GFX1250-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0| +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1 +; GFX1250-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0 +; GFX1250-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call bfloat @llvm.log.bf16(bfloat %a) ret bfloat %op } @@ -29884,6 +31438,13 @@ define bfloat @v_log2_bf16(bfloat %a) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_log2_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_log_bf16_e32 v0, v0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call bfloat @llvm.log2.bf16(bfloat %a) ret bfloat %op } @@ -30117,6 +31678,32 @@ define bfloat @v_log10_bf16(bfloat %a) { ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_log10_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1250-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo +; GFX1250-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) +; GFX1250-NEXT: v_log_f32_e32 v0, v0 +; GFX1250-NEXT: v_nop +; GFX1250-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 +; GFX1250-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0| +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1 +; GFX1250-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0 +; GFX1250-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call bfloat @llvm.log10.bf16(bfloat %a) ret bfloat %op } @@ -30358,6 +31945,35 @@ define bfloat @v_exp_bf16(bfloat %a) { ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_exp_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX1250-NEXT: s_mov_b32 s0, 0x3fb8aa3b +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v1 +; GFX1250-NEXT: v_rndne_f32_e32 v3, v2 +; GFX1250-NEXT: v_fma_mix_f32_bf16 v4, v0, s0, -v2 op_sel_hi:[1,0,0] +; GFX1250-NEXT: s_mov_b32 s0, 0x32a5705f +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_sub_f32_e32 v2, v2, v3 +; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, s0, v4 op_sel_hi:[1,0,0] +; GFX1250-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0xc2ce8ed0, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_add_f32_e32 v0, v2, v0 +; GFX1250-NEXT: v_cvt_i32_f32_e32 v2, v3 +; GFX1250-NEXT: v_exp_f32_e32 v0, v0 +; GFX1250-NEXT: v_nop +; GFX1250-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX1250-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo +; GFX1250-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0x42b17218, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_cndmask_b32_e32 v0, 0x7f800000, v0, vcc_lo +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call bfloat @llvm.exp.bf16(bfloat %a) ret bfloat %op } @@ -30521,6 +32137,13 @@ define bfloat @v_exp2_bf16(bfloat %a) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_exp2_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_exp_bf16_e32 v0, v0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call bfloat @llvm.exp2.bf16(bfloat %a) ret bfloat %op } @@ -30758,6 +32381,35 @@ define bfloat @v_exp10_bf16(bfloat %a) { ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_exp10_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX1250-NEXT: s_mov_b32 s0, 0x40549a78 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_mul_f32_e32 v2, 0x40549a78, v1 +; GFX1250-NEXT: v_rndne_f32_e32 v3, v2 +; GFX1250-NEXT: v_fma_mix_f32_bf16 v4, v0, s0, -v2 op_sel_hi:[1,0,0] +; GFX1250-NEXT: s_mov_b32 s0, 0x33979a37 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_sub_f32_e32 v2, v2, v3 +; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, s0, v4 op_sel_hi:[1,0,0] +; GFX1250-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0xc23369f4, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_add_f32_e32 v0, v2, v0 +; GFX1250-NEXT: v_cvt_i32_f32_e32 v2, v3 +; GFX1250-NEXT: v_exp_f32_e32 v0, v0 +; GFX1250-NEXT: v_nop +; GFX1250-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX1250-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo +; GFX1250-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0x421a209b, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_cndmask_b32_e32 v0, 0x7f800000, v0, vcc_lo +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call bfloat @llvm.exp10.bf16(bfloat %a) ret bfloat %op } @@ -30864,6 +32516,16 @@ define bfloat @v_ceil_bf16(bfloat %a) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_ceil_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_ceil_f32_e32 v0, v0 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call bfloat @llvm.ceil.bf16(bfloat %a) ret bfloat %op } @@ -30970,6 +32632,16 @@ define bfloat @v_trunc_bf16(bfloat %a) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_trunc_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_trunc_f32_e32 v0, v0 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call bfloat @llvm.trunc.bf16(bfloat %a) ret bfloat %op } @@ -31076,6 +32748,16 @@ define bfloat @v_rint_bf16(bfloat %a) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_rint_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_rndne_f32_e32 v0, v0 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call bfloat @llvm.rint.bf16(bfloat %a) ret bfloat %op } @@ -31182,6 +32864,16 @@ define bfloat @v_nearbyint_bf16(bfloat %a) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_nearbyint_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_rndne_f32_e32 v0, v0 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call bfloat @llvm.nearbyint.bf16(bfloat %a) ret bfloat %op } @@ -31338,6 +33030,24 @@ define bfloat @v_round_bf16(bfloat %a) { ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_round_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_trunc_f32_e32 v1, v0 +; GFX1250-NEXT: v_sub_f32_e32 v2, v0, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_ge_f32_e64 s0, |v2|, 0.5 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_bfi_b32 v0, 0x7fffffff, v2, v0 +; GFX1250-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call bfloat @llvm.round.bf16(bfloat %a) ret bfloat %op } @@ -31444,6 +33154,16 @@ define bfloat @v_roundeven_bf16(bfloat %a) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_roundeven_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_rndne_f32_e32 v0, v0 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call bfloat @llvm.roundeven.bf16(bfloat %a) ret bfloat %op } @@ -31550,6 +33270,16 @@ define bfloat @v_floor_bf16(bfloat %a) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_floor_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_floor_f32_e32 v0, v0 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call bfloat @llvm.floor.bf16(bfloat %a) ret bfloat %op } @@ -31654,6 +33384,16 @@ define bfloat @v_canonicalize_bf16(bfloat %a) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_canonicalize_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_max_num_f32_e32 v0, v0, v0 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call bfloat @llvm.canonicalize.bf16(bfloat %a) ret bfloat %op } @@ -31702,6 +33442,13 @@ define i1 @v_fcmp_false_bf16(bfloat %a, bfloat %b) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fcmp_false_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fcmp false bfloat %a, %b ret i1 %op } @@ -31787,6 +33534,16 @@ define i1 @v_fcmp_oeq_bf16(bfloat %a, bfloat %b) { ; GFX11FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v0, v1 ; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fcmp_oeq_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_eq_f32_e32 vcc_lo, v0, v1 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fcmp oeq bfloat %a, %b ret i1 %op } @@ -31872,6 +33629,16 @@ define i1 @v_fcmp_ogt_bf16(bfloat %a, bfloat %b) { ; GFX11FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v0, v1 ; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fcmp_ogt_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_gt_f32_e32 vcc_lo, v0, v1 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fcmp ogt bfloat %a, %b ret i1 %op } @@ -31957,6 +33724,16 @@ define i1 @v_fcmp_oge_bf16(bfloat %a, bfloat %b) { ; GFX11FAKE16-NEXT: v_cmp_ge_f32_e32 vcc_lo, v0, v1 ; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fcmp_oge_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_ge_f32_e32 vcc_lo, v0, v1 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fcmp oge bfloat %a, %b ret i1 %op } @@ -32042,6 +33819,16 @@ define i1 @v_fcmp_olt_bf16(bfloat %a, bfloat %b) { ; GFX11FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v1 ; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fcmp_olt_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v1 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fcmp olt bfloat %a, %b ret i1 %op } @@ -32127,6 +33914,16 @@ define i1 @v_fcmp_ole_bf16(bfloat %a, bfloat %b) { ; GFX11FAKE16-NEXT: v_cmp_le_f32_e32 vcc_lo, v0, v1 ; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fcmp_ole_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_le_f32_e32 vcc_lo, v0, v1 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fcmp ole bfloat %a, %b ret i1 %op } @@ -32212,6 +34009,16 @@ define i1 @v_fcmp_one_bf16(bfloat %a, bfloat %b) { ; GFX11FAKE16-NEXT: v_cmp_lg_f32_e32 vcc_lo, v0, v1 ; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fcmp_one_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_lg_f32_e32 vcc_lo, v0, v1 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fcmp one bfloat %a, %b ret i1 %op } @@ -32297,6 +34104,16 @@ define i1 @v_fcmp_uno_bf16(bfloat %a, bfloat %b) { ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v1 ; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fcmp_uno_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v1 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fcmp uno bfloat %a, %b ret i1 %op } @@ -32382,6 +34199,16 @@ define i1 @v_fcmp_ueq_bf16(bfloat %a, bfloat %b) { ; GFX11FAKE16-NEXT: v_cmp_nlg_f32_e32 vcc_lo, v0, v1 ; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fcmp_ueq_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_nlg_f32_e32 vcc_lo, v0, v1 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fcmp ueq bfloat %a, %b ret i1 %op } @@ -32467,6 +34294,16 @@ define i1 @v_fcmp_ugt_bf16(bfloat %a, bfloat %b) { ; GFX11FAKE16-NEXT: v_cmp_nle_f32_e32 vcc_lo, v0, v1 ; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fcmp_ugt_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_nle_f32_e32 vcc_lo, v0, v1 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fcmp ugt bfloat %a, %b ret i1 %op } @@ -32552,6 +34389,16 @@ define i1 @v_fcmp_uge_bf16(bfloat %a, bfloat %b) { ; GFX11FAKE16-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v0, v1 ; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fcmp_uge_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v0, v1 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fcmp uge bfloat %a, %b ret i1 %op } @@ -32637,6 +34484,16 @@ define i1 @v_fcmp_ult_bf16(bfloat %a, bfloat %b) { ; GFX11FAKE16-NEXT: v_cmp_nge_f32_e32 vcc_lo, v0, v1 ; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fcmp_ult_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_nge_f32_e32 vcc_lo, v0, v1 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fcmp ult bfloat %a, %b ret i1 %op } @@ -32722,6 +34579,16 @@ define i1 @v_fcmp_ule_bf16(bfloat %a, bfloat %b) { ; GFX11FAKE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v0, v1 ; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fcmp_ule_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v0, v1 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fcmp ule bfloat %a, %b ret i1 %op } @@ -32807,6 +34674,16 @@ define i1 @v_fcmp_une_bf16(bfloat %a, bfloat %b) { ; GFX11FAKE16-NEXT: v_cmp_neq_f32_e32 vcc_lo, v0, v1 ; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fcmp_une_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_neq_f32_e32 vcc_lo, v0, v1 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fcmp une bfloat %a, %b ret i1 %op } @@ -32847,6 +34724,13 @@ define i1 @v_fcmp_true_bf16(bfloat %a, bfloat %b) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v0, 1 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fcmp_true_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, 1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fcmp true bfloat %a, %b ret i1 %op } @@ -32905,6 +34789,15 @@ define i16 @v_fptosi_bf16_to_i16(bfloat %x) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fptosi_bf16_to_i16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fptosi bfloat %x to i16 ret i16 %op } @@ -33005,6 +34898,19 @@ define <2 x i16> @v_fptosi_v2bf16_to_v2i16(<2 x bfloat> %x) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fptosi_v2bf16_to_v2i16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX1250-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX1250-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fptosi <2 x bfloat> %x to <2 x i16> ret <2 x i16> %op } @@ -33125,6 +35031,20 @@ define <3 x i16> @v_fptosi_v3bf16_to_v3i16(<3 x bfloat> %x) { ; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v1, v1 ; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x5040100 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fptosi_v3bf16_to_v3i16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v2, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX1250-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GFX1250-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX1250-NEXT: v_perm_b32 v0, v0, v2, 0x5040100 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fptosi <3 x bfloat> %x to <3 x i16> ret <3 x i16> %op } @@ -33277,6 +35197,24 @@ define <4 x i16> @v_fptosi_v4bf16_to_v4i16(<4 x bfloat> %x) { ; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v3, 0x5040100 ; GFX11FAKE16-NEXT: v_perm_b32 v1, v1, v2, 0x5040100 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fptosi_v4bf16_to_v4i16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v2, 16, v1 :: v_dual_lshlrev_b32 v3, 16, v0 +; GFX1250-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GFX1250-NEXT: v_cvt_i32_f32_e32 v3, v3 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX1250-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_perm_b32 v0, v0, v3, 0x5040100 +; GFX1250-NEXT: v_perm_b32 v1, v1, v2, 0x5040100 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fptosi <4 x bfloat> %x to <4 x i16> ret <4 x i16> %op } @@ -33335,6 +35273,15 @@ define i32 @v_fptosi_bf16_to_i32(bfloat %x) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fptosi_bf16_to_i32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fptosi bfloat %x to i32 ret i32 %op } @@ -33400,6 +35347,17 @@ define <2 x i32> @v_fptosi_v2bf16_to_v2i32(<2 x bfloat> %x) { ; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v1 ; GFX11-NEXT: v_cvt_i32_f32_e32 v1, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fptosi_v2bf16_to_v2i32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_cvt_i32_f32_e32 v0, v1 +; GFX1250-NEXT: v_cvt_i32_f32_e32 v1, v2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fptosi <2 x bfloat> %x to <2 x i32> ret <2 x i32> %op } @@ -33482,6 +35440,19 @@ define <3 x i32> @v_fptosi_v3bf16_to_v3i32(<3 x bfloat> %x) { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_cvt_i32_f32_e32 v2, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fptosi_v3bf16_to_v3i32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v2, 16, v0 :: v_dual_lshlrev_b32 v4, 16, v1 +; GFX1250-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_cvt_i32_f32_e32 v0, v2 +; GFX1250-NEXT: v_cvt_i32_f32_e32 v2, v4 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1250-NEXT: v_cvt_i32_f32_e32 v1, v3 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fptosi <3 x bfloat> %x to <3 x i32> ret <3 x i32> %op } @@ -33578,6 +35549,21 @@ define <4 x i32> @v_fptosi_v4bf16_to_v4i32(<4 x bfloat> %x) { ; GFX11-NEXT: v_cvt_i32_f32_e32 v2, v4 ; GFX11-NEXT: v_cvt_i32_f32_e32 v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fptosi_v4bf16_to_v4i32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v2, 16, v0 :: v_dual_lshlrev_b32 v4, 16, v1 +; GFX1250-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_cvt_i32_f32_e32 v0, v2 +; GFX1250-NEXT: v_cvt_i32_f32_e32 v2, v4 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_cvt_i32_f32_e32 v1, v3 +; GFX1250-NEXT: v_cvt_i32_f32_e32 v3, v5 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fptosi <4 x bfloat> %x to <4 x i32> ret <4 x i32> %op } @@ -33742,6 +35728,27 @@ define i64 @v_fptosi_bf16_to_i64(bfloat %x) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v3, vcc_lo ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fptosi_bf16_to_i64: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_trunc_f32_e32 v0, v0 +; GFX1250-NEXT: v_mul_f32_e64 v1, 0x2f800000, |v0| +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_floor_f32_e32 v1, v1 +; GFX1250-NEXT: v_fma_f32 v2, 0xcf800000, v1, |v0| +; GFX1250-NEXT: v_ashrrev_i32_e32 v0, 31, v0 +; GFX1250-NEXT: v_cvt_u32_f32_e32 v3, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX1250-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_bitop2_b32 v3, v3, v0 bitop3:0x14 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_xor_b32_e32 v2, v2, v0 +; GFX1250-NEXT: v_sub_nc_u64_e32 v[0:1], v[2:3], v[0:1] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fptosi bfloat %x to i64 ret i64 %op } @@ -33973,6 +35980,42 @@ define <2 x i64> @v_fptosi_v2bf16_to_v2i64(<2 x bfloat> %x) { ; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, v4, v6 ; GFX11-NEXT: v_sub_co_ci_u32_e64 v3, null, v3, v6, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fptosi_v2bf16_to_v2i64: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX1250-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_trunc_f32_e32 v3, v0 +; GFX1250-NEXT: v_mul_f32_e64 v2, 0x2f800000, |v3| +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_floor_f32_e32 v5, v2 +; GFX1250-NEXT: v_ashrrev_i32_e32 v2, 31, v3 +; GFX1250-NEXT: v_trunc_f32_e32 v1, v1 +; GFX1250-NEXT: v_fma_f32 v3, 0xcf800000, v5, |v3| +; GFX1250-NEXT: v_cvt_u32_f32_e32 v7, v5 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_mul_f32_e64 v0, 0x2f800000, |v1| +; GFX1250-NEXT: v_cvt_u32_f32_e32 v8, v3 +; GFX1250-NEXT: v_mov_b32_e32 v3, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_floor_f32_e32 v4, v0 +; GFX1250-NEXT: v_dual_ashrrev_i32 v0, 31, v1 :: v_dual_bitop2_b32 v7, v7, v2 bitop3:0x14 +; GFX1250-NEXT: v_fma_f32 v6, 0xcf800000, v4, |v1| +; GFX1250-NEXT: v_cvt_u32_f32_e32 v4, v4 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GFX1250-NEXT: v_mov_b32_e32 v1, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_xor_b32_e32 v5, v4, v0 +; GFX1250-NEXT: v_xor_b32_e32 v4, v6, v0 +; GFX1250-NEXT: v_xor_b32_e32 v6, v8, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_sub_nc_u64_e32 v[0:1], v[4:5], v[0:1] +; GFX1250-NEXT: v_sub_nc_u64_e32 v[2:3], v[6:7], v[2:3] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fptosi <2 x bfloat> %x to <2 x i64> ret <2 x i64> %op } @@ -34293,6 +36336,52 @@ define <3 x i64> @v_fptosi_v3bf16_to_v3i64(<3 x bfloat> %x) { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_sub_co_ci_u32_e64 v5, null, v6, v8, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fptosi_v3bf16_to_v3i64: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v2, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX1250-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_trunc_f32_e32 v6, v2 +; GFX1250-NEXT: v_trunc_f32_e32 v8, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_trunc_f32_e32 v7, v0 +; GFX1250-NEXT: v_mul_f32_e64 v1, 0x2f800000, |v6| +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_mul_f32_e64 v5, 0x2f800000, |v8| +; GFX1250-NEXT: v_mul_f32_e64 v3, 0x2f800000, |v7| +; GFX1250-NEXT: v_dual_ashrrev_i32 v0, 31, v6 :: v_dual_ashrrev_i32 v2, 31, v7 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_floor_f32_e32 v9, v1 +; GFX1250-NEXT: v_floor_f32_e32 v11, v5 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_floor_f32_e32 v10, v3 +; GFX1250-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_ashrrev_i32 v4, 31, v8 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_fma_f32 v6, 0xcf800000, v9, |v6| +; GFX1250-NEXT: v_fma_f32 v8, 0xcf800000, v11, |v8| +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1250-NEXT: v_fma_f32 v7, 0xcf800000, v10, |v7| +; GFX1250-NEXT: v_cvt_u32_f32_e32 v9, v9 +; GFX1250-NEXT: v_cvt_u32_f32_e32 v10, v10 +; GFX1250-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GFX1250-NEXT: v_cvt_u32_f32_e32 v11, v11 +; GFX1250-NEXT: v_cvt_u32_f32_e32 v12, v7 +; GFX1250-NEXT: v_cvt_u32_f32_e32 v13, v8 +; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v5, v4 +; GFX1250-NEXT: v_xor_b32_e32 v7, v9, v0 +; GFX1250-NEXT: v_xor_b32_e32 v6, v6, v0 +; GFX1250-NEXT: v_xor_b32_e32 v9, v10, v2 +; GFX1250-NEXT: v_xor_b32_e32 v8, v12, v2 +; GFX1250-NEXT: v_xor_b32_e32 v11, v11, v4 +; GFX1250-NEXT: v_xor_b32_e32 v10, v13, v4 +; GFX1250-NEXT: v_sub_nc_u64_e32 v[0:1], v[6:7], v[0:1] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_sub_nc_u64_e32 v[2:3], v[8:9], v[2:3] +; GFX1250-NEXT: v_sub_nc_u64_e32 v[4:5], v[10:11], v[4:5] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fptosi <3 x bfloat> %x to <3 x i64> ret <3 x i64> %op } @@ -34698,6 +36787,61 @@ define <4 x i64> @v_fptosi_v4bf16_to_v4i64(<4 x bfloat> %x) { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_sub_co_ci_u32_e64 v7, null, v7, v13, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fptosi_v4bf16_to_v4i64: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v2, 16, v0 :: v_dual_lshlrev_b32 v3, 16, v1 +; GFX1250-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_trunc_f32_e32 v7, v2 +; GFX1250-NEXT: v_trunc_f32_e32 v9, v3 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_trunc_f32_e32 v8, v0 +; GFX1250-NEXT: v_trunc_f32_e32 v10, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_mul_f32_e64 v1, 0x2f800000, |v7| +; GFX1250-NEXT: v_mul_f32_e64 v5, 0x2f800000, |v9| +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_mul_f32_e64 v3, 0x2f800000, |v8| +; GFX1250-NEXT: v_mul_f32_e64 v11, 0x2f800000, |v10| +; GFX1250-NEXT: v_dual_ashrrev_i32 v0, 31, v7 :: v_dual_ashrrev_i32 v2, 31, v8 +; GFX1250-NEXT: v_floor_f32_e32 v12, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1250-NEXT: v_floor_f32_e32 v13, v3 +; GFX1250-NEXT: v_floor_f32_e32 v14, v5 +; GFX1250-NEXT: v_floor_f32_e32 v11, v11 +; GFX1250-NEXT: v_dual_ashrrev_i32 v4, 31, v9 :: v_dual_ashrrev_i32 v6, 31, v10 +; GFX1250-NEXT: v_fma_f32 v7, 0xcf800000, v12, |v7| +; GFX1250-NEXT: v_cvt_u32_f32_e32 v12, v12 +; GFX1250-NEXT: v_fma_f32 v8, 0xcf800000, v13, |v8| +; GFX1250-NEXT: v_fma_f32 v15, 0xcf800000, v14, |v9| +; GFX1250-NEXT: v_fma_f32 v16, 0xcf800000, v11, |v10| +; GFX1250-NEXT: v_cvt_u32_f32_e32 v7, v7 +; GFX1250-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v3, v2 +; GFX1250-NEXT: v_mov_b32_e32 v5, v4 +; GFX1250-NEXT: v_cvt_u32_f32_e32 v13, v13 +; GFX1250-NEXT: v_cvt_u32_f32_e32 v14, v14 +; GFX1250-NEXT: v_xor_b32_e32 v9, v12, v0 +; GFX1250-NEXT: v_cvt_u32_f32_e32 v10, v8 +; GFX1250-NEXT: v_xor_b32_e32 v8, v7, v0 +; GFX1250-NEXT: v_cvt_u32_f32_e32 v12, v11 +; GFX1250-NEXT: v_cvt_u32_f32_e32 v7, v15 +; GFX1250-NEXT: v_cvt_u32_f32_e32 v15, v16 +; GFX1250-NEXT: v_xor_b32_e32 v11, v13, v2 +; GFX1250-NEXT: v_xor_b32_e32 v10, v10, v2 +; GFX1250-NEXT: v_sub_nc_u64_e32 v[0:1], v[8:9], v[0:1] +; GFX1250-NEXT: v_xor_b32_e32 v9, v14, v4 +; GFX1250-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_bitop2_b32 v8, v7, v4 bitop3:0x14 +; GFX1250-NEXT: v_xor_b32_e32 v13, v12, v6 +; GFX1250-NEXT: v_xor_b32_e32 v12, v15, v6 +; GFX1250-NEXT: v_sub_nc_u64_e32 v[2:3], v[10:11], v[2:3] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_sub_nc_u64_e32 v[4:5], v[8:9], v[4:5] +; GFX1250-NEXT: v_sub_nc_u64_e32 v[6:7], v[12:13], v[6:7] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fptosi <4 x bfloat> %x to <4 x i64> ret <4 x i64> %op } @@ -34795,6 +36939,16 @@ define bfloat @v_sitofp_i16_to_bf16(i16 %x) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_sitofp_i16_to_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = sitofp i16 %x to bfloat ret bfloat %op } @@ -34936,6 +37090,19 @@ define <2 x bfloat> @v_sitofp_v2i16_to_v2bf16(<2 x i16> %x) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_sitofp_v2i16_to_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_ashrrev_i32_e32 v1, 16, v0 +; GFX1250-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX1250-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = sitofp <2 x i16> %x to <2 x bfloat> ret <2 x bfloat> %op } @@ -35125,6 +37292,23 @@ define <3 x bfloat> @v_sitofp_v3i16_to_v3bf16(<3 x i16> %x) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_sitofp_v3i16_to_v3bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_ashrrev_i32_e32 v2, 16, v0 +; GFX1250-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX1250-NEXT: v_bfe_i32 v1, v1, 0, 16 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GFX1250-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = sitofp <3 x i16> %x to <3 x bfloat> ret <3 x bfloat> %op } @@ -35355,6 +37539,24 @@ define <4 x bfloat> @v_sitofp_v4i16_to_v4bf16(<4 x i16> %x) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v1, v1, v2, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_sitofp_v4i16_to_v4bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_ashrrev_i32 v2, 16, v1 :: v_dual_ashrrev_i32 v3, 16, v0 +; GFX1250-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX1250-NEXT: v_bfe_i32 v1, v1, 0, 16 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GFX1250-NEXT: v_cvt_f32_i32_e32 v3, v3 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX1250-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, v2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = sitofp <4 x i16> %x to <4 x bfloat> ret <4 x bfloat> %op } @@ -35446,6 +37648,15 @@ define bfloat @v_sitofp_i32_to_bf16(i32 %x) { ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_sitofp_i32_to_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = sitofp i32 %x to bfloat ret bfloat %op } @@ -35577,6 +37788,16 @@ define <2 x bfloat> @v_sitofp_v2i32_to_v2bf16(<2 x i32> %x) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_sitofp_v2i32_to_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX1250-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = sitofp <2 x i32> %x to <2 x bfloat> ret <2 x bfloat> %op } @@ -35750,6 +37971,18 @@ define <3 x bfloat> @v_sitofp_v3i32_to_v3bf16(<3 x i32> %x) { ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo ; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v2, 16 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_sitofp_v3i32_to_v3bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX1250-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX1250-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v2, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = sitofp <3 x i32> %x to <3 x bfloat> ret <3 x bfloat> %op } @@ -35959,6 +38192,19 @@ define <4 x bfloat> @v_sitofp_v4i32_to_v4bf16(<4 x i32> %x) { ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v6, v4, vcc_lo ; GFX11FAKE16-NEXT: v_perm_b32 v1, v3, v2, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_sitofp_v4i32_to_v4bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_cvt_f32_i32_e32 v3, v3 +; GFX1250-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX1250-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX1250-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v2, v3 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = sitofp <4 x i32> %x to <4 x bfloat> ret <4 x bfloat> %op } @@ -36148,6 +38394,28 @@ define bfloat @v_sitofp_i64_to_bf16(i64 %x) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_sitofp_i64_to_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_xor_b32_e32 v2, v0, v1 +; GFX1250-NEXT: v_cls_i32_e32 v3, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_ashrrev_i32_e32 v2, 31, v2 +; GFX1250-NEXT: v_add_nc_u32_e32 v2, 32, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_add_min_u32_e64 v2, v3, -1, v2 +; GFX1250-NEXT: v_lshlrev_b64_e32 v[0:1], v2, v[0:1] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX1250-NEXT: v_dual_sub_nc_u32 v1, 32, v2 :: v_dual_bitop2_b32 v0, v1, v0 bitop3:0x54 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX1250-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = sitofp i64 %x to bfloat ret bfloat %op } @@ -36474,6 +38742,40 @@ define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_sitofp_v2i64_to_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_xor_b32_e32 v5, v0, v1 +; GFX1250-NEXT: v_xor_b32_e32 v4, v2, v3 +; GFX1250-NEXT: v_cls_i32_e32 v6, v3 +; GFX1250-NEXT: v_cls_i32_e32 v7, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_ashrrev_i32 v5, 31, v5 :: v_dual_ashrrev_i32 v4, 31, v4 +; GFX1250-NEXT: v_dual_add_nc_u32 v5, 32, v5 :: v_dual_add_nc_u32 v4, 32, v4 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_add_min_u32_e64 v5, v7, -1, v5 +; GFX1250-NEXT: v_add_min_u32_e64 v4, v6, -1, v4 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_lshlrev_b64_e32 v[0:1], v5, v[0:1] +; GFX1250-NEXT: v_lshlrev_b64_e32 v[2:3], v4, v[2:3] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX1250-NEXT: v_min_u32_e32 v2, 1, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX1250-NEXT: v_dual_sub_nc_u32 v1, 32, v4 :: v_dual_bitop2_b32 v2, v3, v2 bitop3:0x54 +; GFX1250-NEXT: v_sub_nc_u32_e32 v3, 32, v5 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX1250-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_ldexp_f32 v0, v0, v3 +; GFX1250-NEXT: v_ldexp_f32 v1, v2, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = sitofp <2 x i64> %x to <2 x bfloat> ret <2 x bfloat> %op } @@ -36929,6 +39231,53 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_sitofp_v3i64_to_v3bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_xor_b32_e32 v8, v4, v5 +; GFX1250-NEXT: v_xor_b32_e32 v6, v2, v3 +; GFX1250-NEXT: v_cls_i32_e32 v10, v3 +; GFX1250-NEXT: v_cls_i32_e32 v9, v5 +; GFX1250-NEXT: v_cls_i32_e32 v11, v1 +; GFX1250-NEXT: v_dual_ashrrev_i32 v8, 31, v8 :: v_dual_bitop2_b32 v7, v0, v1 bitop3:0x14 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_ashrrev_i32 v6, 31, v6 :: v_dual_ashrrev_i32 v7, 31, v7 +; GFX1250-NEXT: v_dual_add_nc_u32 v6, 32, v6 :: v_dual_add_nc_u32 v7, 32, v7 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_add_min_u32_e64 v6, v10, -1, v6 +; GFX1250-NEXT: v_add_min_u32_e64 v7, v11, -1, v7 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_lshlrev_b64_e32 v[2:3], v6, v[2:3] +; GFX1250-NEXT: v_lshlrev_b64_e32 v[0:1], v7, v[0:1] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_min_u32_e32 v2, 1, v2 +; GFX1250-NEXT: v_add_nc_u32_e32 v8, 32, v8 +; GFX1250-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX1250-NEXT: v_add_min_u32_e64 v8, v9, -1, v8 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_dual_sub_nc_u32 v3, 32, v6 :: v_dual_bitop2_b32 v0, v1, v0 bitop3:0x54 +; GFX1250-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_lshlrev_b64_e32 v[4:5], v8, v[4:5] +; GFX1250-NEXT: v_sub_nc_u32_e32 v8, 32, v8 +; GFX1250-NEXT: v_ldexp_f32 v2, v2, v3 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX1250-NEXT: v_dual_sub_nc_u32 v4, 32, v7 :: v_dual_bitop2_b32 v1, v5, v4 bitop3:0x54 +; GFX1250-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX1250-NEXT: v_ldexp_f32 v0, v0, v4 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_ldexp_f32 v1, v1, v8 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = sitofp <3 x i64> %x to <3 x bfloat> ret <3 x bfloat> %op } @@ -37509,6 +39858,64 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v4, v9, vcc_lo ; GFX11FAKE16-NEXT: v_perm_b32 v1, v3, v2, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_sitofp_v4i64_to_v4bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_xor_b32_e32 v9, v4, v5 +; GFX1250-NEXT: v_xor_b32_e32 v8, v6, v7 +; GFX1250-NEXT: v_cls_i32_e32 v12, v7 +; GFX1250-NEXT: v_cls_i32_e32 v13, v5 +; GFX1250-NEXT: v_cls_i32_e32 v14, v3 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_dual_ashrrev_i32 v9, 31, v9 :: v_dual_ashrrev_i32 v8, 31, v8 +; GFX1250-NEXT: v_xor_b32_e32 v10, v2, v3 +; GFX1250-NEXT: v_cls_i32_e32 v15, v1 +; GFX1250-NEXT: v_dual_add_nc_u32 v9, 32, v9 :: v_dual_add_nc_u32 v8, 32, v8 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_dual_ashrrev_i32 v10, 31, v10 :: v_dual_bitop2_b32 v11, v0, v1 bitop3:0x14 +; GFX1250-NEXT: v_add_min_u32_e64 v9, v13, -1, v9 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_add_min_u32_e64 v8, v12, -1, v8 +; GFX1250-NEXT: v_dual_ashrrev_i32 v11, 31, v11 :: v_dual_add_nc_u32 v10, 32, v10 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_lshlrev_b64_e32 v[4:5], v9, v[4:5] +; GFX1250-NEXT: v_lshlrev_b64_e32 v[6:7], v8, v[6:7] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_add_nc_u32_e32 v11, 32, v11 +; GFX1250-NEXT: v_add_min_u32_e64 v10, v14, -1, v10 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_add_min_u32_e64 v11, v15, -1, v11 +; GFX1250-NEXT: v_lshlrev_b64_e32 v[2:3], v10, v[2:3] +; GFX1250-NEXT: v_min_u32_e32 v6, 1, v6 +; GFX1250-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_lshlrev_b64_e32 v[0:1], v11, v[0:1] +; GFX1250-NEXT: v_or_b32_e32 v6, v7, v6 +; GFX1250-NEXT: v_min_u32_e32 v2, 1, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_dual_sub_nc_u32 v5, 32, v10 :: v_dual_bitop2_b32 v4, v5, v4 bitop3:0x54 +; GFX1250-NEXT: v_sub_nc_u32_e32 v7, 32, v9 +; GFX1250-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX1250-NEXT: v_dual_sub_nc_u32 v3, 32, v8 :: v_dual_bitop2_b32 v2, v3, v2 bitop3:0x54 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_cvt_f32_i32_e32 v4, v4 +; GFX1250-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX1250-NEXT: v_cvt_f32_i32_e32 v1, v6 +; GFX1250-NEXT: v_sub_nc_u32_e32 v6, 32, v11 +; GFX1250-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX1250-NEXT: v_ldexp_f32 v1, v1, v3 +; GFX1250-NEXT: v_ldexp_f32 v3, v4, v7 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_ldexp_f32 v2, v2, v5 +; GFX1250-NEXT: v_ldexp_f32 v0, v0, v6 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v3, v1 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = sitofp <4 x i64> %x to <4 x bfloat> ret <4 x bfloat> %op } @@ -37607,6 +40014,16 @@ define bfloat @v_uitofp_i16_to_bf16(i16 %x) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_uitofp_i16_to_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = uitofp i16 %x to bfloat ret bfloat %op } @@ -37749,6 +40166,19 @@ define <2 x bfloat> @v_uitofp_v2i16_to_v2bf16(<2 x i16> %x) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_uitofp_v2i16_to_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX1250-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX1250-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = uitofp <2 x i16> %x to <2 x bfloat> ret <2 x bfloat> %op } @@ -37942,6 +40372,23 @@ define <3 x bfloat> @v_uitofp_v3i16_to_v3bf16(<3 x i16> %x) { ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo ; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_uitofp_v3i16_to_v3bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX1250-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_cvt_f32_u32_e32 v2, v2 +; GFX1250-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = uitofp <3 x i16> %x to <3 x bfloat> ret <3 x bfloat> %op } @@ -38178,6 +40625,24 @@ define <4 x bfloat> @v_uitofp_v4i16_to_v4bf16(<4 x i16> %x) { ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc_lo ; GFX11FAKE16-NEXT: v_perm_b32 v1, v1, v2, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_uitofp_v4i16_to_v4bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_lshrrev_b32 v2, 16, v1 :: v_dual_lshrrev_b32 v3, 16, v0 +; GFX1250-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_cvt_f32_u32_e32 v2, v2 +; GFX1250-NEXT: v_cvt_f32_u32_e32 v3, v3 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX1250-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, v2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = uitofp <4 x i16> %x to <4 x bfloat> ret <4 x bfloat> %op } @@ -38269,6 +40734,15 @@ define bfloat @v_uitofp_i32_to_bf16(i32 %x) { ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_uitofp_i32_to_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = uitofp i32 %x to bfloat ret bfloat %op } @@ -38400,6 +40874,16 @@ define <2 x bfloat> @v_uitofp_v2i32_to_v2bf16(<2 x i32> %x) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_uitofp_v2i32_to_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX1250-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = uitofp <2 x i32> %x to <2 x bfloat> ret <2 x bfloat> %op } @@ -38573,6 +41057,18 @@ define <3 x bfloat> @v_uitofp_v3i32_to_v3bf16(<3 x i32> %x) { ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo ; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v2, 16 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_uitofp_v3i32_to_v3bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX1250-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX1250-NEXT: v_cvt_f32_u32_e32 v2, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v2, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = uitofp <3 x i32> %x to <3 x bfloat> ret <3 x bfloat> %op } @@ -38782,6 +41278,19 @@ define <4 x bfloat> @v_uitofp_v4i32_to_v4bf16(<4 x i32> %x) { ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v6, v4, vcc_lo ; GFX11FAKE16-NEXT: v_perm_b32 v1, v3, v2, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_uitofp_v4i32_to_v4bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_cvt_f32_u32_e32 v3, v3 +; GFX1250-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX1250-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX1250-NEXT: v_cvt_f32_u32_e32 v2, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v2, v3 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = uitofp <4 x i32> %x to <4 x bfloat> ret <4 x bfloat> %op } @@ -38935,6 +41444,24 @@ define bfloat @v_uitofp_i64_to_bf16(i64 %x) { ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_uitofp_i64_to_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_clz_i32_u32_e32 v2, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_min_u32_e32 v2, 32, v2 +; GFX1250-NEXT: v_lshlrev_b64_e32 v[0:1], v2, v[0:1] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX1250-NEXT: v_dual_sub_nc_u32 v1, 32, v2 :: v_dual_bitop2_b32 v0, v1, v0 bitop3:0x54 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX1250-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = uitofp i64 %x to bfloat ret bfloat %op } @@ -39190,6 +41717,35 @@ define <2 x bfloat> @v_uitofp_v2i64_to_v2bf16(<2 x i64> %x) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_uitofp_v2i64_to_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_clz_i32_u32_e32 v4, v3 +; GFX1250-NEXT: v_clz_i32_u32_e32 v5, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_min_u32_e32 v4, 32, v4 +; GFX1250-NEXT: v_min_u32_e32 v5, 32, v5 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_lshlrev_b64_e32 v[2:3], v4, v[2:3] +; GFX1250-NEXT: v_lshlrev_b64_e32 v[0:1], v5, v[0:1] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_min_u32_e32 v2, 1, v2 +; GFX1250-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX1250-NEXT: v_dual_sub_nc_u32 v3, 32, v5 :: v_dual_bitop2_b32 v0, v1, v0 bitop3:0x54 +; GFX1250-NEXT: v_sub_nc_u32_e32 v1, 32, v4 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_cvt_f32_u32_e32 v2, v2 +; GFX1250-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_ldexp_f32 v1, v2, v1 +; GFX1250-NEXT: v_ldexp_f32 v0, v0, v3 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = uitofp <2 x i64> %x to <2 x bfloat> ret <2 x bfloat> %op } @@ -39548,6 +42104,45 @@ define <3 x bfloat> @v_uitofp_v3i64_to_v3bf16(<3 x i64> %x) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_uitofp_v3i64_to_v3bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_clz_i32_u32_e32 v6, v3 +; GFX1250-NEXT: v_clz_i32_u32_e32 v7, v1 +; GFX1250-NEXT: v_clz_i32_u32_e32 v8, v5 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_min_u32_e32 v6, 32, v6 +; GFX1250-NEXT: v_min_u32_e32 v7, 32, v7 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_min_u32_e32 v8, 32, v8 +; GFX1250-NEXT: v_lshlrev_b64_e32 v[2:3], v6, v[2:3] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_lshlrev_b64_e32 v[0:1], v7, v[0:1] +; GFX1250-NEXT: v_lshlrev_b64_e32 v[4:5], v8, v[4:5] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_min_u32_e32 v2, 1, v2 +; GFX1250-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX1250-NEXT: v_dual_sub_nc_u32 v8, 32, v8 :: v_dual_bitop2_b32 v2, v3, v2 bitop3:0x54 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_dual_sub_nc_u32 v3, 32, v6 :: v_dual_bitop2_b32 v0, v1, v0 bitop3:0x54 +; GFX1250-NEXT: v_dual_sub_nc_u32 v4, 32, v7 :: v_dual_bitop2_b32 v1, v5, v4 bitop3:0x54 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_cvt_f32_u32_e32 v2, v2 +; GFX1250-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX1250-NEXT: v_ldexp_f32 v2, v2, v3 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_ldexp_f32 v0, v0, v4 +; GFX1250-NEXT: v_ldexp_f32 v1, v1, v8 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v2 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = uitofp <3 x i64> %x to <3 x bfloat> ret <3 x bfloat> %op } @@ -39996,6 +42591,54 @@ define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v1, v3, v2, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_uitofp_v4i64_to_v4bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_clz_i32_u32_e32 v8, v7 +; GFX1250-NEXT: v_clz_i32_u32_e32 v9, v3 +; GFX1250-NEXT: v_clz_i32_u32_e32 v10, v1 +; GFX1250-NEXT: v_clz_i32_u32_e32 v11, v5 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_min_u32_e32 v8, 32, v8 +; GFX1250-NEXT: v_min_u32_e32 v9, 32, v9 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_min_u32_e32 v10, 32, v10 +; GFX1250-NEXT: v_min_u32_e32 v11, 32, v11 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_lshlrev_b64_e32 v[6:7], v8, v[6:7] +; GFX1250-NEXT: v_lshlrev_b64_e32 v[2:3], v9, v[2:3] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_lshlrev_b64_e32 v[0:1], v10, v[0:1] +; GFX1250-NEXT: v_lshlrev_b64_e32 v[4:5], v11, v[4:5] +; GFX1250-NEXT: v_dual_sub_nc_u32 v8, 32, v8 :: v_dual_sub_nc_u32 v11, 32, v11 +; GFX1250-NEXT: v_min_u32_e32 v6, 1, v6 +; GFX1250-NEXT: v_min_u32_e32 v2, 1, v2 +; GFX1250-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX1250-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_dual_sub_nc_u32 v9, 32, v9 :: v_dual_bitop2_b32 v6, v7, v6 bitop3:0x54 +; GFX1250-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_dual_sub_nc_u32 v3, 32, v10 :: v_dual_bitop2_b32 v0, v1, v0 bitop3:0x54 +; GFX1250-NEXT: v_or_b32_e32 v1, v5, v4 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_cvt_f32_u32_e32 v4, v6 +; GFX1250-NEXT: v_cvt_f32_u32_e32 v2, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX1250-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_ldexp_f32 v4, v4, v8 +; GFX1250-NEXT: v_ldexp_f32 v2, v2, v9 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_ldexp_f32 v0, v0, v3 +; GFX1250-NEXT: v_ldexp_f32 v1, v1, v11 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v2 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, v4 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = uitofp <4 x i64> %x to <4 x bfloat> ret <4 x bfloat> %op } @@ -40073,6 +42716,16 @@ define bfloat @v_select_bf16(i1 %cond, bfloat %a, bfloat %b) { ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_select_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1250-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = select i1 %cond, bfloat %a, bfloat %b ret bfloat %op } @@ -40156,6 +42809,17 @@ define bfloat @v_select_fneg_lhs_bf16(i1 %cond, bfloat %a, bfloat %b) { ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_select_fneg_lhs_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1250-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1250-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %neg.a = fneg bfloat %a %op = select i1 %cond, bfloat %neg.a, bfloat %b ret bfloat %op @@ -40240,6 +42904,17 @@ define bfloat @v_select_fneg_rhs_bf16(i1 %cond, bfloat %a, bfloat %b) { ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_select_fneg_rhs_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1250-NEXT: v_xor_b32_e32 v2, 0x8000, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1250-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %neg.b = fneg bfloat %b %op = select i1 %cond, bfloat %a, bfloat %neg.b ret bfloat %op @@ -40349,6 +43024,19 @@ define <2 x bfloat> @v_select_v2bf16(i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b) ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_select_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_lshrrev_b32 v3, 16, v1 :: v_dual_bitop2_b32 v0, 1, v0 bitop3:0x40 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1250-NEXT: v_dual_lshrrev_b32 v4, 16, v2 :: v_dual_cndmask_b32 v0, v2, v1, vcc_lo +; GFX1250-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc_lo +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = select i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b ret <2 x bfloat> %op } @@ -40466,6 +43154,21 @@ define <2 x bfloat> @v_vselect_v2bf16(<2 x i1> %cond, <2 x bfloat> %a, <2 x bflo ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc_lo ; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_vselect_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_lshrrev_b32 v4, 16, v2 :: v_dual_bitop2_b32 v0, 1, v0 bitop3:0x40 +; GFX1250-NEXT: v_dual_lshrrev_b32 v5, 16, v3 :: v_dual_bitop2_b32 v1, 1, v1 bitop3:0x40 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1250-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc_lo +; GFX1250-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = select <2 x i1> %cond, <2 x bfloat> %a, <2 x bfloat> %b ret <2 x bfloat> %op } @@ -40552,6 +43255,17 @@ define amdgpu_ps i32 @s_select_bf16(bfloat inreg %a, bfloat inreg %b, i32 %c) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11FAKE16-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: s_select_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_cndmask_b32_e32 v0, s1, v1, vcc_lo +; GFX1250-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1250-NEXT: ; return to shader part epilog %cond = icmp eq i32 %c, 0 %op = select i1 %cond, bfloat %a, bfloat %b %cast = bitcast bfloat %op to i16 @@ -40687,6 +43401,21 @@ define amdgpu_ps i32 @s_select_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11FAKE16-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: s_select_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_lshr_b32 s2, s0, 16 +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1250-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s0 +; GFX1250-NEXT: s_lshr_b32 s3, s1, 16 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: v_cndmask_b32_e32 v0, s3, v1, vcc_lo +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_cndmask_b32_e32 v1, s1, v2, vcc_lo +; GFX1250-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1250-NEXT: ; return to shader part epilog %cond = icmp eq i32 %c, 0 %op = select i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b %cast = bitcast <2 x bfloat> %op to i32 @@ -40824,6 +43553,22 @@ define amdgpu_ps i32 @s_vselect_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11FAKE16-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: s_vselect_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_lshr_b32 s2, s0, 16 +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1250-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s0 +; GFX1250-NEXT: s_lshr_b32 s0, s1, 16 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: v_cndmask_b32_e32 v1, s0, v2, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_cndmask_b32_e32 v0, s1, v3, vcc_lo +; GFX1250-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1250-NEXT: ; return to shader part epilog %cond = icmp eq <2 x i32> %c, zeroinitializer %op = select <2 x i1> %cond, <2 x bfloat> %a, <2 x bfloat> %b %cast = bitcast <2 x bfloat> %op to i32 @@ -40925,6 +43670,16 @@ define <3 x bfloat> @v_select_v3bf16(i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v3, v1 :: v_dual_cndmask_b32 v1, v4, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_select_v3bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1250-NEXT: v_dual_cndmask_b32 v0, v3, v1 :: v_dual_cndmask_b32 v1, v4, v2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = select i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b ret <3 x bfloat> %op } @@ -41033,6 +43788,16 @@ define <4 x bfloat> @v_select_v4bf16(i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v3, v1 :: v_dual_cndmask_b32 v1, v4, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_select_v4bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1250-NEXT: v_dual_cndmask_b32 v0, v3, v1 :: v_dual_cndmask_b32 v1, v4, v2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = select i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b ret <4 x bfloat> %op } @@ -41168,6 +43933,17 @@ define <6 x bfloat> @v_select_v6bf16(i1 %cond, <6 x bfloat> %a, <6 x bfloat> %b) ; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v1 :: v_dual_cndmask_b32 v1, v5, v2 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_select_v6bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1250-NEXT: v_dual_cndmask_b32 v0, v4, v1 :: v_dual_cndmask_b32 v1, v5, v2 +; GFX1250-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc_lo +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = select i1 %cond, <6 x bfloat> %a, <6 x bfloat> %b ret <6 x bfloat> %op } @@ -41329,6 +44105,17 @@ define <8 x bfloat> @v_select_v8bf16(i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b) ; GFX11-NEXT: v_dual_cndmask_b32 v0, v5, v1 :: v_dual_cndmask_b32 v1, v6, v2 ; GFX11-NEXT: v_dual_cndmask_b32 v2, v7, v3 :: v_dual_cndmask_b32 v3, v8, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_select_v8bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1250-NEXT: v_dual_cndmask_b32 v0, v5, v1 :: v_dual_cndmask_b32 v1, v6, v2 +; GFX1250-NEXT: v_dual_cndmask_b32 v2, v7, v3 :: v_dual_cndmask_b32 v3, v8, v4 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = select i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b ret <8 x bfloat> %op } @@ -41604,6 +44391,19 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat> ; GFX11-NEXT: v_dual_cndmask_b32 v4, v13, v5 :: v_dual_cndmask_b32 v5, v14, v6 ; GFX11-NEXT: v_dual_cndmask_b32 v6, v15, v7 :: v_dual_cndmask_b32 v7, v16, v8 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_select_v16bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1250-NEXT: v_dual_cndmask_b32 v0, v9, v1 :: v_dual_cndmask_b32 v1, v10, v2 +; GFX1250-NEXT: v_dual_cndmask_b32 v2, v11, v3 :: v_dual_cndmask_b32 v3, v12, v4 +; GFX1250-NEXT: v_dual_cndmask_b32 v4, v13, v5 :: v_dual_cndmask_b32 v5, v14, v6 +; GFX1250-NEXT: v_dual_cndmask_b32 v6, v15, v7 :: v_dual_cndmask_b32 v7, v16, v8 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = select i1 %cond, <16 x bfloat> %a, <16 x bfloat> %b ret <16 x bfloat> %op } @@ -42234,6 +45034,27 @@ define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat> ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_cndmask_b32 v14, v31, v15 :: v_dual_cndmask_b32 v15, v32, v16 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_select_v32bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: scratch_load_b32 v31, off, s32 +; GFX1250-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX1250-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1250-NEXT: v_dual_cndmask_b32 v0, v17, v1 :: v_dual_cndmask_b32 v1, v18, v2 +; GFX1250-NEXT: v_dual_cndmask_b32 v2, v19, v3 :: v_dual_cndmask_b32 v3, v20, v4 +; GFX1250-NEXT: v_dual_cndmask_b32 v4, v21, v5 :: v_dual_cndmask_b32 v5, v22, v6 +; GFX1250-NEXT: v_dual_cndmask_b32 v6, v23, v7 :: v_dual_cndmask_b32 v7, v24, v8 +; GFX1250-NEXT: v_dual_cndmask_b32 v8, v25, v9 :: v_dual_cndmask_b32 v9, v26, v10 +; GFX1250-NEXT: v_dual_cndmask_b32 v10, v27, v11 :: v_dual_cndmask_b32 v11, v28, v12 +; GFX1250-NEXT: v_dual_cndmask_b32 v12, v29, v13 :: v_dual_cndmask_b32 v13, v30, v14 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_dual_cndmask_b32 v14, v31, v15 :: v_dual_cndmask_b32 v15, v32, v16 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = select i1 %cond, <32 x bfloat> %a, <32 x bfloat> %b ret <32 x bfloat> %op } @@ -42349,6 +45170,20 @@ define amdgpu_ps <2 x i32> @s_select_v3bf16(<3 x bfloat> inreg %a, <3 x bfloat> ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readfirstlane_b32 s1, v1 ; GFX11-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: s_select_v3bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_cndmask_b32_e32 v0, s2, v1, vcc_lo +; GFX1250-NEXT: v_cndmask_b32_e32 v1, s3, v2, vcc_lo +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_readfirstlane_b32 s1, v1 +; GFX1250-NEXT: ; return to shader part epilog %cond = icmp eq i32 %c, 0 %op = select i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b %cast = bitcast <3 x bfloat> %op to i48 @@ -42475,6 +45310,18 @@ define amdgpu_ps <2 x i32> @s_select_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> ; GFX11-NEXT: v_readfirstlane_b32 s1, v0 ; GFX11-NEXT: v_readfirstlane_b32 s0, v1 ; GFX11-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: s_select_v4bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s0 +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_cndmask_b32_e32 v0, s3, v1, vcc_lo +; GFX1250-NEXT: v_cndmask_b32_e32 v1, s2, v2, vcc_lo +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_readfirstlane_b32 s1, v0 +; GFX1250-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1250-NEXT: ; return to shader part epilog %cond = icmp eq i32 %c, 0 %op = select i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b %cast = bitcast <4 x bfloat> %op to <2 x i32> @@ -42709,6 +45556,33 @@ define amdgpu_ps <2 x i32> @s_vselect_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_readfirstlane_b32 s1, v1 ; GFX11FAKE16-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: s_vselect_v4bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_lshr_b32 s4, s1, 16 +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 +; GFX1250-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s1 +; GFX1250-NEXT: s_lshr_b32 s4, s3, 16 +; GFX1250-NEXT: s_lshr_b32 s5, s0, 16 +; GFX1250-NEXT: v_mov_b32_e32 v6, s0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-NEXT: v_cndmask_b32_e32 v3, s4, v4, vcc_lo +; GFX1250-NEXT: v_mov_b32_e32 v4, s5 +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1250-NEXT: s_lshr_b32 s0, s2, 16 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: v_cndmask_b32_e32 v1, s0, v4, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1250-NEXT: v_cndmask_b32_e32 v0, s2, v6, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX1250-NEXT: v_cndmask_b32_e32 v2, s3, v5, vcc_lo +; GFX1250-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v1 +; GFX1250-NEXT: ; return to shader part epilog %cond = icmp eq <4 x i32> %c, zeroinitializer %op = select <4 x i1> %cond, <4 x bfloat> %a, <4 x bfloat> %b %cast = bitcast <4 x bfloat> %op to <2 x i32> @@ -42912,6 +45786,28 @@ define <4 x bfloat> @v_vselect_v4bf16(<4 x i1> %cond, <4 x bfloat> %a, <4 x bflo ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_vselect_v4bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX1250-NEXT: v_dual_lshrrev_b32 v8, 16, v4 :: v_dual_bitop2_b32 v1, 1, v1 bitop3:0x40 +; GFX1250-NEXT: v_dual_lshrrev_b32 v9, 16, v6 :: v_dual_bitop2_b32 v3, 1, v3 bitop3:0x40 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 +; GFX1250-NEXT: v_dual_cndmask_b32 v2, v7, v5, vcc_lo :: v_dual_bitop2_b32 v0, 1, v0 bitop3:0x40 +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1250-NEXT: v_dual_lshrrev_b32 v7, 16, v7 :: v_dual_lshrrev_b32 v5, 16, v5 +; GFX1250-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX1250-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 +; GFX1250-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc_lo +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX1250-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = select <4 x i1> %cond, <4 x bfloat> %a, <4 x bfloat> %b ret <4 x bfloat> %op } @@ -43264,6 +46160,46 @@ define <8 x bfloat> @v_vselect_v8bf16(<8 x i1> %cond, <8 x bfloat> %a, <8 x bflo ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11FAKE16-NEXT: v_perm_b32 v3, v7, v6, 0x5040100 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_vselect_v8bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX1250-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX1250-NEXT: v_dual_lshrrev_b32 v17, 16, v14 :: v_dual_bitop2_b32 v5, 1, v5 bitop3:0x40 +; GFX1250-NEXT: v_dual_lshrrev_b32 v16, 16, v10 :: v_dual_bitop2_b32 v3, 1, v3 bitop3:0x40 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 +; GFX1250-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX1250-NEXT: v_dual_cndmask_b32 v6, v15, v11, vcc_lo :: v_dual_bitop2_b32 v0, 1, v0 bitop3:0x40 +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 +; GFX1250-NEXT: v_and_b32_e32 v7, 1, v7 +; GFX1250-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX1250-NEXT: v_dual_cndmask_b32 v4, v14, v10 :: v_dual_lshrrev_b32 v15, 16, v15 +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5 +; GFX1250-NEXT: v_dual_lshrrev_b32 v14, 16, v12 :: v_dual_bitop2_b32 v2, 1, v2 bitop3:0x40 +; GFX1250-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; GFX1250-NEXT: v_cndmask_b32_e32 v5, v17, v16, vcc_lo +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 +; GFX1250-NEXT: v_cndmask_b32_e32 v2, v13, v9, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1250-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX1250-NEXT: v_dual_cndmask_b32 v0, v12, v8 :: v_dual_lshrrev_b32 v13, 16, v13 +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX1250-NEXT: v_cndmask_b32_e32 v1, v14, v10, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX1250-NEXT: v_cndmask_b32_e32 v3, v13, v9, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7 +; GFX1250-NEXT: v_cndmask_b32_e32 v7, v15, v11, vcc_lo +; GFX1250-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 +; GFX1250-NEXT: v_perm_b32 v2, v5, v4, 0x5040100 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1250-NEXT: v_perm_b32 v3, v7, v6, 0x5040100 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = select <8 x i1> %cond, <8 x bfloat> %a, <8 x bfloat> %b ret <8 x bfloat> %op } @@ -44002,6 +46938,74 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11FAKE16-NEXT: v_perm_b32 v7, v15, v14, 0x5040100 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_vselect_v16bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: scratch_load_b32 v31, off, s32 +; GFX1250-NEXT: v_dual_lshrrev_b32 v52, 16, v25 :: v_dual_bitop2_b32 v12, 1, v12 bitop3:0x40 +; GFX1250-NEXT: v_dual_lshrrev_b32 v53, 16, v16 :: v_dual_bitop2_b32 v13, 1, v13 bitop3:0x40 +; GFX1250-NEXT: v_dual_lshrrev_b32 v33, 16, v22 :: v_dual_bitop2_b32 v0, 1, v0 bitop3:0x40 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12 +; GFX1250-NEXT: v_dual_lshrrev_b32 v34, 16, v30 :: v_dual_bitop2_b32 v3, 1, v3 bitop3:0x40 +; GFX1250-NEXT: v_dual_lshrrev_b32 v51, 16, v17 :: v_dual_bitop2_b32 v10, 1, v10 bitop3:0x40 +; GFX1250-NEXT: v_cndmask_b32_e32 v12, v30, v22, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v13 +; GFX1250-NEXT: v_dual_lshrrev_b32 v50, 16, v26 :: v_dual_bitop2_b32 v11, 1, v11 bitop3:0x40 +; GFX1250-NEXT: v_and_b32_e32 v14, 1, v14 +; GFX1250-NEXT: v_dual_lshrrev_b32 v35, 16, v21 :: v_dual_bitop2_b32 v2, 1, v2 bitop3:0x40 +; GFX1250-NEXT: v_cndmask_b32_e32 v13, v34, v33, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10 +; GFX1250-NEXT: v_dual_lshrrev_b32 v36, 16, v29 :: v_dual_bitop2_b32 v4, 1, v4 bitop3:0x40 +; GFX1250-NEXT: v_dual_lshrrev_b32 v49, 16, v18 :: v_dual_bitop2_b32 v8, 1, v8 bitop3:0x40 +; GFX1250-NEXT: v_cndmask_b32_e32 v10, v29, v21, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v11 +; GFX1250-NEXT: v_dual_lshrrev_b32 v37, 16, v20 :: v_dual_bitop2_b32 v5, 1, v5 bitop3:0x40 +; GFX1250-NEXT: v_dual_lshrrev_b32 v38, 16, v28 :: v_dual_bitop2_b32 v7, 1, v7 bitop3:0x40 +; GFX1250-NEXT: v_dual_lshrrev_b32 v48, 16, v27 :: v_dual_bitop2_b32 v9, 1, v9 bitop3:0x40 +; GFX1250-NEXT: v_cndmask_b32_e32 v11, v36, v35, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8 +; GFX1250-NEXT: v_dual_lshrrev_b32 v39, 16, v19 :: v_dual_bitop2_b32 v6, 1, v6 bitop3:0x40 +; GFX1250-NEXT: v_dual_lshrrev_b32 v32, 16, v23 :: v_dual_bitop2_b32 v1, 1, v1 bitop3:0x40 +; GFX1250-NEXT: v_cndmask_b32_e32 v8, v28, v20, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v9 +; GFX1250-NEXT: v_dual_lshrrev_b32 v54, 16, v24 :: v_dual_bitop2_b32 v15, 1, v15 bitop3:0x40 +; GFX1250-NEXT: v_cndmask_b32_e32 v9, v38, v37, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 +; GFX1250-NEXT: v_cndmask_b32_e32 v6, v27, v19, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 +; GFX1250-NEXT: v_cndmask_b32_e32 v4, v26, v18, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 +; GFX1250-NEXT: v_cndmask_b32_e32 v2, v25, v17, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 +; GFX1250-NEXT: v_cndmask_b32_e32 v3, v52, v51, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1250-NEXT: v_cndmask_b32_e32 v0, v24, v16, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX1250-NEXT: v_cndmask_b32_e32 v1, v54, v53, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5 +; GFX1250-NEXT: v_cndmask_b32_e32 v5, v50, v49, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX1250-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 +; GFX1250-NEXT: v_perm_b32 v2, v5, v4, 0x5040100 +; GFX1250-NEXT: v_perm_b32 v4, v9, v8, 0x5040100 +; GFX1250-NEXT: v_perm_b32 v5, v11, v10, 0x5040100 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_lshrrev_b32_e32 v3, 16, v31 +; GFX1250-NEXT: v_cndmask_b32_e32 v7, v48, v39, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v14 +; GFX1250-NEXT: v_cndmask_b32_e32 v14, v31, v23, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15 +; GFX1250-NEXT: v_cndmask_b32_e32 v15, v3, v32, vcc_lo +; GFX1250-NEXT: v_perm_b32 v3, v7, v6, 0x5040100 +; GFX1250-NEXT: v_perm_b32 v6, v13, v12, 0x5040100 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1250-NEXT: v_perm_b32 v7, v15, v14, 0x5040100 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = select <16 x i1> %cond, <16 x bfloat> %a, <16 x bfloat> %b ret <16 x bfloat> %op } @@ -45856,6 +48860,178 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX11FAKE16-NEXT: v_perm_b32 v14, v29, v28, 0x5040100 ; GFX11FAKE16-NEXT: v_perm_b32 v15, v31, v30, 0x5040100 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_vselect_v32bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_clause 0x1b +; GFX1250-NEXT: scratch_load_b32 v31, off, s32 offset:60 +; GFX1250-NEXT: scratch_load_b32 v32, off, s32 offset:124 +; GFX1250-NEXT: scratch_load_u16 v33, off, s32 +; GFX1250-NEXT: scratch_load_b32 v34, off, s32 offset:128 +; GFX1250-NEXT: scratch_load_b32 v35, off, s32 offset:64 +; GFX1250-NEXT: scratch_load_b32 v36, off, s32 offset:120 +; GFX1250-NEXT: scratch_load_b32 v37, off, s32 offset:56 +; GFX1250-NEXT: scratch_load_b32 v38, off, s32 offset:116 +; GFX1250-NEXT: scratch_load_b32 v39, off, s32 offset:52 +; GFX1250-NEXT: scratch_load_b32 v48, off, s32 offset:112 +; GFX1250-NEXT: scratch_load_b32 v49, off, s32 offset:48 +; GFX1250-NEXT: scratch_load_b32 v50, off, s32 offset:108 +; GFX1250-NEXT: scratch_load_b32 v51, off, s32 offset:44 +; GFX1250-NEXT: scratch_load_b32 v52, off, s32 offset:104 +; GFX1250-NEXT: scratch_load_b32 v53, off, s32 offset:40 +; GFX1250-NEXT: scratch_load_b32 v54, off, s32 offset:100 +; GFX1250-NEXT: scratch_load_b32 v55, off, s32 offset:36 +; GFX1250-NEXT: scratch_load_b32 v64, off, s32 offset:76 +; GFX1250-NEXT: scratch_load_b32 v65, off, s32 offset:12 +; GFX1250-NEXT: scratch_load_b32 v66, off, s32 offset:96 +; GFX1250-NEXT: scratch_load_b32 v67, off, s32 offset:32 +; GFX1250-NEXT: scratch_load_b32 v68, off, s32 offset:80 +; GFX1250-NEXT: scratch_load_b32 v69, off, s32 offset:84 +; GFX1250-NEXT: scratch_load_b32 v70, off, s32 offset:92 +; GFX1250-NEXT: scratch_load_b32 v71, off, s32 offset:28 +; GFX1250-NEXT: scratch_load_b32 v80, off, s32 offset:20 +; GFX1250-NEXT: scratch_load_b32 v81, off, s32 offset:88 +; GFX1250-NEXT: scratch_load_b32 v82, off, s32 offset:24 +; GFX1250-NEXT: v_and_b32_e32 v30, 1, v30 +; GFX1250-NEXT: v_and_b32_e32 v29, 1, v29 +; GFX1250-NEXT: v_and_b32_e32 v26, 1, v26 +; GFX1250-NEXT: v_and_b32_e32 v24, 1, v24 +; GFX1250-NEXT: v_and_b32_e32 v22, 1, v22 +; GFX1250-NEXT: v_and_b32_e32 v20, 1, v20 +; GFX1250-NEXT: v_and_b32_e32 v18, 1, v18 +; GFX1250-NEXT: v_and_b32_e32 v16, 1, v16 +; GFX1250-NEXT: v_and_b32_e32 v10, 1, v10 +; GFX1250-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX1250-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX1250-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX1250-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX1250-NEXT: v_and_b32_e32 v5, 1, v5 +; GFX1250-NEXT: v_and_b32_e32 v23, 1, v23 +; GFX1250-NEXT: v_and_b32_e32 v9, 1, v9 +; GFX1250-NEXT: v_and_b32_e32 v13, 1, v13 +; GFX1250-NEXT: v_and_b32_e32 v15, 1, v15 +; GFX1250-NEXT: v_and_b32_e32 v21, 1, v21 +; GFX1250-NEXT: v_and_b32_e32 v11, 1, v11 +; GFX1250-NEXT: v_and_b32_e32 v19, 1, v19 +; GFX1250-NEXT: s_wait_loadcnt 0x1a +; GFX1250-NEXT: v_dual_lshrrev_b32 v83, 16, v32 :: v_dual_bitop2_b32 v17, 1, v17 bitop3:0x40 +; GFX1250-NEXT: v_cmp_eq_u32_e64 s1, 1, v30 +; GFX1250-NEXT: v_and_b32_e32 v28, 1, v28 +; GFX1250-NEXT: s_wait_loadcnt 0x17 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_dual_cndmask_b32 v30, v34, v35, s1 :: v_dual_bitop2_b32 v33, 1, v33 bitop3:0x40 +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v28 +; GFX1250-NEXT: v_lshrrev_b32_e32 v28, 16, v31 +; GFX1250-NEXT: v_cmp_eq_u32_e64 s0, 1, v29 +; GFX1250-NEXT: scratch_load_b32 v29, off, s32 offset:16 +; GFX1250-NEXT: v_dual_lshrrev_b32 v35, 16, v35 :: v_dual_lshrrev_b32 v34, 16, v34 +; GFX1250-NEXT: v_cndmask_b32_e32 v31, v32, v31, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v33 +; GFX1250-NEXT: scratch_load_b32 v32, off, s32 offset:72 +; GFX1250-NEXT: v_cndmask_b32_e64 v28, v83, v28, s0 +; GFX1250-NEXT: scratch_load_b32 v83, off, s32 offset:4 +; GFX1250-NEXT: v_cndmask_b32_e32 v34, v34, v35, vcc_lo +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: scratch_load_b32 v35, off, s32 offset:68 +; GFX1250-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v26 +; GFX1250-NEXT: s_wait_loadcnt 0x1a +; GFX1250-NEXT: v_dual_cndmask_b32 v26, v36, v37, vcc_lo :: v_dual_bitop2_b32 v0, 1, v0 bitop3:0x40 +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v24 +; GFX1250-NEXT: v_dual_lshrrev_b32 v37, 16, v37 :: v_dual_bitop2_b32 v2, 1, v2 bitop3:0x40 +; GFX1250-NEXT: s_wait_loadcnt 0x18 +; GFX1250-NEXT: v_dual_lshrrev_b32 v36, 16, v36 :: v_dual_cndmask_b32 v24, v38, v39, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v22 +; GFX1250-NEXT: v_dual_lshrrev_b32 v38, 16, v38 :: v_dual_bitop2_b32 v7, 1, v7 bitop3:0x40 +; GFX1250-NEXT: s_wait_loadcnt 0x16 +; GFX1250-NEXT: v_dual_cndmask_b32 v22, v48, v49 :: v_dual_lshrrev_b32 v39, 16, v39 +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v20 +; GFX1250-NEXT: v_dual_lshrrev_b32 v49, 16, v49 :: v_dual_bitop2_b32 v8, 1, v8 bitop3:0x40 +; GFX1250-NEXT: s_wait_loadcnt 0x14 +; GFX1250-NEXT: v_dual_lshrrev_b32 v48, 16, v48 :: v_dual_cndmask_b32 v20, v50, v51, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v18 +; GFX1250-NEXT: v_dual_lshrrev_b32 v51, 16, v51 :: v_dual_bitop2_b32 v12, 1, v12 bitop3:0x40 +; GFX1250-NEXT: s_wait_loadcnt 0x12 +; GFX1250-NEXT: v_dual_lshrrev_b32 v50, 16, v50 :: v_dual_cndmask_b32 v18, v52, v53, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v16 +; GFX1250-NEXT: v_dual_lshrrev_b32 v53, 16, v53 :: v_dual_bitop2_b32 v14, 1, v14 bitop3:0x40 +; GFX1250-NEXT: s_wait_loadcnt 0x10 +; GFX1250-NEXT: v_dual_lshrrev_b32 v52, 16, v52 :: v_dual_cndmask_b32 v16, v54, v55, vcc_lo +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v14 +; GFX1250-NEXT: v_dual_lshrrev_b32 v55, 16, v55 :: v_dual_lshrrev_b32 v54, 16, v54 +; GFX1250-NEXT: s_wait_loadcnt 0xc +; GFX1250-NEXT: v_cndmask_b32_e32 v14, v66, v67, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12 +; GFX1250-NEXT: v_dual_lshrrev_b32 v67, 16, v67 :: v_dual_lshrrev_b32 v66, 16, v66 +; GFX1250-NEXT: s_wait_loadcnt 0x8 +; GFX1250-NEXT: v_cndmask_b32_e32 v12, v70, v71, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10 +; GFX1250-NEXT: v_dual_lshrrev_b32 v70, 16, v70 :: v_dual_bitop2_b32 v25, 1, v25 bitop3:0x40 +; GFX1250-NEXT: s_wait_loadcnt 0x5 +; GFX1250-NEXT: v_dual_cndmask_b32 v10, v81, v82 :: v_dual_lshrrev_b32 v71, 16, v71 +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8 +; GFX1250-NEXT: v_dual_lshrrev_b32 v82, 16, v82 :: v_dual_bitop2_b32 v27, 1, v27 bitop3:0x40 +; GFX1250-NEXT: v_dual_cndmask_b32 v8, v69, v80 :: v_dual_lshrrev_b32 v81, 16, v81 +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 +; GFX1250-NEXT: v_dual_lshrrev_b32 v80, 16, v80 :: v_dual_lshrrev_b32 v69, 16, v69 +; GFX1250-NEXT: s_wait_loadcnt 0x4 +; GFX1250-NEXT: v_dual_cndmask_b32 v6, v68, v29 :: v_dual_lshrrev_b32 v29, 16, v29 +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 +; GFX1250-NEXT: v_dual_lshrrev_b32 v68, 16, v68 :: v_dual_cndmask_b32 v4, v64, v65, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 +; GFX1250-NEXT: v_dual_lshrrev_b32 v65, 16, v65 :: v_dual_lshrrev_b32 v64, 16, v64 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_dual_cndmask_b32 v2, v32, v33 :: v_dual_lshrrev_b32 v33, 16, v33 +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1250-NEXT: v_dual_lshrrev_b32 v32, 16, v32 :: v_dual_cndmask_b32 v0, v35, v83, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v27 +; GFX1250-NEXT: v_dual_lshrrev_b32 v83, 16, v83 :: v_dual_cndmask_b32 v27, v36, v37, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v25 +; GFX1250-NEXT: v_cndmask_b32_e32 v25, v38, v39, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v23 +; GFX1250-NEXT: v_dual_lshrrev_b32 v35, 16, v35 :: v_dual_cndmask_b32 v23, v48, v49, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v21 +; GFX1250-NEXT: v_cndmask_b32_e32 v21, v50, v51, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v19 +; GFX1250-NEXT: v_cndmask_b32_e32 v19, v52, v53, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v17 +; GFX1250-NEXT: v_cndmask_b32_e32 v17, v54, v55, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15 +; GFX1250-NEXT: v_cndmask_b32_e32 v15, v66, v67, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v13 +; GFX1250-NEXT: v_cndmask_b32_e32 v13, v70, v71, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v11 +; GFX1250-NEXT: v_cndmask_b32_e32 v11, v81, v82, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7 +; GFX1250-NEXT: v_cndmask_b32_e32 v7, v68, v29, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 +; GFX1250-NEXT: v_cndmask_b32_e32 v3, v32, v33, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX1250-NEXT: v_cndmask_b32_e32 v1, v35, v83, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5 +; GFX1250-NEXT: v_cndmask_b32_e32 v5, v64, v65, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v9 +; GFX1250-NEXT: v_cndmask_b32_e32 v9, v69, v80, vcc_lo +; GFX1250-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX1250-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 +; GFX1250-NEXT: v_perm_b32 v2, v5, v4, 0x5040100 +; GFX1250-NEXT: v_perm_b32 v3, v7, v6, 0x5040100 +; GFX1250-NEXT: v_perm_b32 v4, v9, v8, 0x5040100 +; GFX1250-NEXT: v_perm_b32 v5, v11, v10, 0x5040100 +; GFX1250-NEXT: v_perm_b32 v6, v13, v12, 0x5040100 +; GFX1250-NEXT: v_perm_b32 v7, v15, v14, 0x5040100 +; GFX1250-NEXT: v_perm_b32 v8, v17, v16, 0x5040100 +; GFX1250-NEXT: v_perm_b32 v9, v19, v18, 0x5040100 +; GFX1250-NEXT: v_perm_b32 v10, v21, v20, 0x5040100 +; GFX1250-NEXT: v_perm_b32 v11, v23, v22, 0x5040100 +; GFX1250-NEXT: v_perm_b32 v12, v25, v24, 0x5040100 +; GFX1250-NEXT: v_perm_b32 v13, v27, v26, 0x5040100 +; GFX1250-NEXT: v_perm_b32 v14, v28, v31, 0x5040100 +; GFX1250-NEXT: v_perm_b32 v15, v34, v30, 0x5040100 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = select <32 x i1> %cond, <32 x bfloat> %a, <32 x bfloat> %b ret <32 x bfloat> %op } @@ -45987,6 +49163,13 @@ define bfloat @v_fma_bf16(bfloat %a, bfloat %b, bfloat %c) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fma_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call bfloat @llvm.fma.bf16(bfloat %a, bfloat %b, bfloat %c) ret bfloat %op } @@ -46178,6 +49361,13 @@ define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fma_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_fma_bf16 v0, v0, v1, v2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) ret <2 x bfloat> %op } @@ -46446,6 +49636,14 @@ define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v3, 16 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fma_v3bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_fma_bf16 v0, v0, v2, v4 +; GFX1250-NEXT: v_pk_fma_bf16 v1, v1, v3, v5 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c) ret <3 x bfloat> %op } @@ -46780,6 +49978,14 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11FAKE16-NEXT: v_perm_b32 v1, v4, v1, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fma_v4bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_fma_bf16 v0, v0, v2, v4 +; GFX1250-NEXT: v_pk_fma_bf16 v1, v1, v3, v5 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c) ret <4 x bfloat> %op } @@ -46915,6 +50121,13 @@ define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fmuladd_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call bfloat @llvm.fmuladd.bf16(bfloat %a, bfloat %b, bfloat %c) ret bfloat %op } @@ -47114,6 +50327,13 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fmuladd_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_fma_bf16 v0, v0, v1, v2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call <2 x bfloat> @llvm.fmuladd.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) ret <2 x bfloat> %op } @@ -47394,6 +50614,14 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v3, 16 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fmuladd_v3bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_fma_bf16 v0, v0, v2, v4 +; GFX1250-NEXT: v_pk_fma_bf16 v1, v1, v3, v5 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call <3 x bfloat> @llvm.fmuladd.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c) ret <3 x bfloat> %op } @@ -47744,6 +50972,16 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11FAKE16-NEXT: v_perm_b32 v1, v4, v1, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fmuladd_v4bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_fma_bf16 v0, v0, v2, v4 +; GFX1250-NEXT: v_pk_fma_bf16 v1, v1, v3, v5 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call <4 x bfloat> @llvm.fmuladd.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c) ret <4 x bfloat> %op } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX1250FAKE16: {{.*}}