diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-select.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-select.ll index 85180a2dc6348..c429b1a32bde6 100644 --- a/llvm/test/CodeGen/AMDGPU/dagcombine-select.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcombine-select.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX942 %s define amdgpu_kernel void @select_and1(ptr addrspace(1) %p, i32 %x, i32 %y) { ; GCN-LABEL: select_and1: @@ -56,24 +57,43 @@ define amdgpu_kernel void @select_and3(ptr addrspace(1) %p, i32 %x, i32 %y) { } define amdgpu_kernel void @select_and_v4(ptr addrspace(1) %p, i32 %x, <4 x i32> %y) { -; GCN-LABEL: select_and_v4: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s8, s[4:5], 0x2c -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 -; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GCN-NEXT: v_mov_b32_e32 v4, 0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_gt_i32 s8, 10 -; GCN-NEXT: s_cselect_b32 s3, s3, 0 -; GCN-NEXT: s_cselect_b32 s2, s2, 0 -; GCN-NEXT: s_cselect_b32 s1, s1, 0 -; GCN-NEXT: s_cselect_b32 s0, s0, 0 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_mov_b32_e32 v2, s2 -; GCN-NEXT: v_mov_b32_e32 v3, s3 -; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] -; GCN-NEXT: s_endpgm +; GFX9-LABEL: select_and_v4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s8, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_cmp_gt_i32 s8, 10 +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_cselect_b32 s1, s1, 0 +; GFX9-NEXT: s_cselect_b32 s0, s0, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GFX9-NEXT: s_endpgm +; +; GFX942-LABEL: select_and_v4: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dword s8, s[4:5], 0x2c +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_cmp_gt_i32 s8, 10 +; GFX942-NEXT: s_cselect_b32 s3, s3, 0 +; GFX942-NEXT: s_cselect_b32 s2, s2, 0 +; GFX942-NEXT: s_cselect_b32 s1, s1, 0 +; GFX942-NEXT: s_cselect_b32 s0, s0, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NEXT: v_mov_b32_e32 v3, s1 +; GFX942-NEXT: v_mov_b32_e32 v4, s2 +; GFX942-NEXT: v_mov_b32_e32 v5, s3 +; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[6:7] +; GFX942-NEXT: s_endpgm %c = icmp slt i32 %x, 11 %s = select i1 %c, <4 x i32> zeroinitializer, <4 x i32> %a = and <4 x i32> %s, %y @@ -136,24 +156,43 @@ define amdgpu_kernel void @select_or3(ptr addrspace(1) %p, i32 %x, i32 %y) { } define amdgpu_kernel void @select_or_v4(ptr addrspace(1) %p, i32 %x, <4 x i32> %y) { -; GCN-LABEL: select_or_v4: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s8, s[4:5], 0x2c -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 -; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GCN-NEXT: v_mov_b32_e32 v4, 0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lt_i32 s8, 11 -; GCN-NEXT: s_cselect_b32 s3, s3, -1 -; GCN-NEXT: s_cselect_b32 s2, s2, -1 -; GCN-NEXT: s_cselect_b32 s1, s1, -1 -; GCN-NEXT: s_cselect_b32 s0, s0, -1 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_mov_b32_e32 v2, s2 -; GCN-NEXT: v_mov_b32_e32 v3, s3 -; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] -; GCN-NEXT: s_endpgm +; GFX9-LABEL: select_or_v4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s8, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_cmp_lt_i32 s8, 11 +; GFX9-NEXT: s_cselect_b32 s3, s3, -1 +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_cselect_b32 s1, s1, -1 +; GFX9-NEXT: s_cselect_b32 s0, s0, -1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GFX9-NEXT: s_endpgm +; +; GFX942-LABEL: select_or_v4: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dword s8, s[4:5], 0x2c +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_cmp_lt_i32 s8, 11 +; GFX942-NEXT: s_cselect_b32 s3, s3, -1 +; GFX942-NEXT: s_cselect_b32 s2, s2, -1 +; GFX942-NEXT: s_cselect_b32 s1, s1, -1 +; GFX942-NEXT: s_cselect_b32 s0, s0, -1 +; GFX942-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NEXT: v_mov_b32_e32 v3, s1 +; GFX942-NEXT: v_mov_b32_e32 v4, s2 +; GFX942-NEXT: v_mov_b32_e32 v5, s3 +; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[6:7] +; GFX942-NEXT: s_endpgm %c = icmp slt i32 %x, 11 %s = select i1 %c, <4 x i32> zeroinitializer, <4 x i32> %a = or <4 x i32> %s, %y @@ -236,23 +275,41 @@ define amdgpu_kernel void @sel_constants_sub_constant_sel_constants_v2i16(ptr ad } define amdgpu_kernel void @sel_constants_sub_constant_sel_constants_v4i32(ptr addrspace(1) %p, i1 %cond) { -; GCN-LABEL: sel_constants_sub_constant_sel_constants_v4i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GCN-NEXT: v_mov_b32_e32 v4, 0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_bitcmp1_b32 s2, 0 -; GCN-NEXT: s_cselect_b32 s2, 7, 14 -; GCN-NEXT: s_cselect_b32 s3, 6, 10 -; GCN-NEXT: s_cselect_b32 s4, 5, 6 -; GCN-NEXT: s_cselect_b32 s5, 9, 2 -; GCN-NEXT: v_mov_b32_e32 v0, s5 -; GCN-NEXT: v_mov_b32_e32 v1, s4 -; GCN-NEXT: v_mov_b32_e32 v2, s3 -; GCN-NEXT: v_mov_b32_e32 v3, s2 -; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] -; GCN-NEXT: s_endpgm +; GFX9-LABEL: sel_constants_sub_constant_sel_constants_v4i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_bitcmp1_b32 s2, 0 +; GFX9-NEXT: s_cselect_b32 s2, 7, 14 +; GFX9-NEXT: s_cselect_b32 s3, 6, 10 +; GFX9-NEXT: s_cselect_b32 s4, 5, 6 +; GFX9-NEXT: s_cselect_b32 s5, 9, 2 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX942-LABEL: sel_constants_sub_constant_sel_constants_v4i32: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_bitcmp1_b32 s2, 0 +; GFX942-NEXT: s_cselect_b32 s2, 7, 14 +; GFX942-NEXT: s_cselect_b32 s3, 6, 10 +; GFX942-NEXT: s_cselect_b32 s4, 5, 6 +; GFX942-NEXT: s_cselect_b32 s5, 9, 2 +; GFX942-NEXT: v_mov_b32_e32 v2, s5 +; GFX942-NEXT: v_mov_b32_e32 v3, s4 +; GFX942-NEXT: v_mov_b32_e32 v4, s3 +; GFX942-NEXT: v_mov_b32_e32 v5, s2 +; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] +; GFX942-NEXT: s_endpgm %sel = select i1 %cond, <4 x i32> , <4 x i32> %bo = sub <4 x i32> , %sel store <4 x i32> %bo, ptr addrspace(1) %p, align 32 @@ -461,24 +518,43 @@ define amdgpu_kernel void @fsub_constant_sel_constants_v2f16(ptr addrspace(1) %p } define amdgpu_kernel void @fsub_constant_sel_constants_v4f32(ptr addrspace(1) %p, i1 %cond) { -; GCN-LABEL: fsub_constant_sel_constants_v4f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GCN-NEXT: s_mov_b32 s3, 0x41500000 -; GCN-NEXT: v_mov_b32_e32 v4, 0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_bitcmp1_b32 s2, 0 -; GCN-NEXT: s_cselect_b32 s2, s3, 0x40c00000 -; GCN-NEXT: s_cselect_b32 s3, 0x41100000, 4.0 -; GCN-NEXT: s_cselect_b32 s4, 0x40a00000, 2.0 -; GCN-NEXT: s_cselect_b32 s5, 1.0, 0 -; GCN-NEXT: v_mov_b32_e32 v0, s5 -; GCN-NEXT: v_mov_b32_e32 v1, s4 -; GCN-NEXT: v_mov_b32_e32 v2, s3 -; GCN-NEXT: v_mov_b32_e32 v3, s2 -; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] -; GCN-NEXT: s_endpgm +; GFX9-LABEL: fsub_constant_sel_constants_v4f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0x41500000 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_bitcmp1_b32 s2, 0 +; GFX9-NEXT: s_cselect_b32 s2, s3, 0x40c00000 +; GFX9-NEXT: s_cselect_b32 s3, 0x41100000, 4.0 +; GFX9-NEXT: s_cselect_b32 s4, 0x40a00000, 2.0 +; GFX9-NEXT: s_cselect_b32 s5, 1.0, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX942-LABEL: fsub_constant_sel_constants_v4f32: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_mov_b32 s3, 0x41500000 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_bitcmp1_b32 s2, 0 +; GFX942-NEXT: s_cselect_b32 s2, s3, 0x40c00000 +; GFX942-NEXT: s_cselect_b32 s3, 0x41100000, 4.0 +; GFX942-NEXT: s_cselect_b32 s4, 0x40a00000, 2.0 +; GFX942-NEXT: s_cselect_b32 s5, 1.0, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, s5 +; GFX942-NEXT: v_mov_b32_e32 v3, s4 +; GFX942-NEXT: v_mov_b32_e32 v4, s3 +; GFX942-NEXT: v_mov_b32_e32 v5, s2 +; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] +; GFX942-NEXT: s_endpgm %sel = select i1 %cond, <4 x float> , <4 x float> %bo = fsub <4 x float> , %sel store <4 x float> %bo, ptr addrspace(1) %p, align 32 diff --git a/llvm/test/CodeGen/AMDGPU/imm.ll b/llvm/test/CodeGen/AMDGPU/imm.ll index 58cfd40113be2..21390003ee565 100644 --- a/llvm/test/CodeGen/AMDGPU/imm.ll +++ b/llvm/test/CodeGen/AMDGPU/imm.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx942 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GFX942 %s ; Use a 64-bit value with lo bits that can be represented as an inline constant define amdgpu_kernel void @i64_imm_inline_lo(ptr addrspace(1) %out) { @@ -25,6 +26,17 @@ define amdgpu_kernel void @i64_imm_inline_lo(ptr addrspace(1) %out) { ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: i64_imm_inline_lo: +; GFX942: ; %bb.0: ; %entry +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_mov_b32 s3, 0xf000 +; GFX942-NEXT: s_mov_b32 s2, -1 +; GFX942-NEXT: v_mov_b32_e32 v0, 5 +; GFX942-NEXT: v_mov_b32_e32 v1, 0x12345678 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX942-NEXT: s_endpgm entry: store i64 1311768464867721221, ptr addrspace(1) %out ; 0x1234567800000005 ret void @@ -53,6 +65,17 @@ define amdgpu_kernel void @i64_imm_inline_hi(ptr addrspace(1) %out) { ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: i64_imm_inline_hi: +; GFX942: ; %bb.0: ; %entry +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_mov_b32 s3, 0xf000 +; GFX942-NEXT: s_mov_b32 s2, -1 +; GFX942-NEXT: v_mov_b32_e32 v0, 0x12345678 +; GFX942-NEXT: v_mov_b32_e32 v1, 5 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX942-NEXT: s_endpgm entry: store i64 21780256376, ptr addrspace(1) %out ; 0x0000000512345678 ret void @@ -80,6 +103,17 @@ define amdgpu_kernel void @store_imm_neg_0.0_i64(ptr addrspace(1) %out) { ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: store_imm_neg_0.0_i64: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_mov_b32 s3, 0xf000 +; GFX942-NEXT: s_mov_b32 s2, -1 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX942-NEXT: s_endpgm store i64 -9223372036854775808, ptr addrspace(1) %out ret void } @@ -104,6 +138,16 @@ define amdgpu_kernel void @store_inline_imm_neg_0.0_i32(ptr addrspace(1) %out) { ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: store_inline_imm_neg_0.0_i32: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_mov_b32 s3, 0xf000 +; GFX942-NEXT: s_mov_b32 s2, -1 +; GFX942-NEXT: v_bfrev_b32_e32 v0, 1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX942-NEXT: s_endpgm store i32 -2147483648, ptr addrspace(1) %out ret void } @@ -128,6 +172,16 @@ define amdgpu_kernel void @store_inline_imm_0.0_f32(ptr addrspace(1) %out) { ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: store_inline_imm_0.0_f32: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_mov_b32 s3, 0xf000 +; GFX942-NEXT: s_mov_b32 s2, -1 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX942-NEXT: s_endpgm store float 0.0, ptr addrspace(1) %out ret void } @@ -152,6 +206,16 @@ define amdgpu_kernel void @store_imm_neg_0.0_f32(ptr addrspace(1) %out) { ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: store_imm_neg_0.0_f32: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_mov_b32 s3, 0xf000 +; GFX942-NEXT: s_mov_b32 s2, -1 +; GFX942-NEXT: v_bfrev_b32_e32 v0, 1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX942-NEXT: s_endpgm store float -0.0, ptr addrspace(1) %out ret void } @@ -176,6 +240,16 @@ define amdgpu_kernel void @store_inline_imm_0.5_f32(ptr addrspace(1) %out) { ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: store_inline_imm_0.5_f32: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_mov_b32 s3, 0xf000 +; GFX942-NEXT: s_mov_b32 s2, -1 +; GFX942-NEXT: v_mov_b32_e32 v0, 0.5 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX942-NEXT: s_endpgm store float 0.5, ptr addrspace(1) %out ret void } @@ -200,6 +274,16 @@ define amdgpu_kernel void @store_inline_imm_m_0.5_f32(ptr addrspace(1) %out) { ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: store_inline_imm_m_0.5_f32: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_mov_b32 s3, 0xf000 +; GFX942-NEXT: s_mov_b32 s2, -1 +; GFX942-NEXT: v_mov_b32_e32 v0, -0.5 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX942-NEXT: s_endpgm store float -0.5, ptr addrspace(1) %out ret void } @@ -224,6 +308,16 @@ define amdgpu_kernel void @store_inline_imm_1.0_f32(ptr addrspace(1) %out) { ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: store_inline_imm_1.0_f32: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_mov_b32 s3, 0xf000 +; GFX942-NEXT: s_mov_b32 s2, -1 +; GFX942-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX942-NEXT: s_endpgm store float 1.0, ptr addrspace(1) %out ret void } @@ -248,6 +342,16 @@ define amdgpu_kernel void @store_inline_imm_m_1.0_f32(ptr addrspace(1) %out) { ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: store_inline_imm_m_1.0_f32: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_mov_b32 s3, 0xf000 +; GFX942-NEXT: s_mov_b32 s2, -1 +; GFX942-NEXT: v_mov_b32_e32 v0, -1.0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX942-NEXT: s_endpgm store float -1.0, ptr addrspace(1) %out ret void } @@ -272,6 +376,16 @@ define amdgpu_kernel void @store_inline_imm_2.0_f32(ptr addrspace(1) %out) { ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: store_inline_imm_2.0_f32: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_mov_b32 s3, 0xf000 +; GFX942-NEXT: s_mov_b32 s2, -1 +; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX942-NEXT: s_endpgm store float 2.0, ptr addrspace(1) %out ret void } @@ -296,6 +410,16 @@ define amdgpu_kernel void @store_inline_imm_m_2.0_f32(ptr addrspace(1) %out) { ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: store_inline_imm_m_2.0_f32: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_mov_b32 s3, 0xf000 +; GFX942-NEXT: s_mov_b32 s2, -1 +; GFX942-NEXT: v_mov_b32_e32 v0, -2.0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX942-NEXT: s_endpgm store float -2.0, ptr addrspace(1) %out ret void } @@ -320,6 +444,16 @@ define amdgpu_kernel void @store_inline_imm_4.0_f32(ptr addrspace(1) %out) { ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: store_inline_imm_4.0_f32: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_mov_b32 s3, 0xf000 +; GFX942-NEXT: s_mov_b32 s2, -1 +; GFX942-NEXT: v_mov_b32_e32 v0, 4.0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX942-NEXT: s_endpgm store float 4.0, ptr addrspace(1) %out ret void } @@ -344,6 +478,16 @@ define amdgpu_kernel void @store_inline_imm_m_4.0_f32(ptr addrspace(1) %out) { ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: store_inline_imm_m_4.0_f32: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_mov_b32 s3, 0xf000 +; GFX942-NEXT: s_mov_b32 s2, -1 +; GFX942-NEXT: v_mov_b32_e32 v0, -4.0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX942-NEXT: s_endpgm store float -4.0, ptr addrspace(1) %out ret void } @@ -368,6 +512,16 @@ define amdgpu_kernel void @store_inline_imm_inv_2pi_f32(ptr addrspace(1) %out) { ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: store_inline_imm_inv_2pi_f32: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_mov_b32 s3, 0xf000 +; GFX942-NEXT: s_mov_b32 s2, -1 +; GFX942-NEXT: v_mov_b32_e32 v0, 0.15915494 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX942-NEXT: s_endpgm store float 0x3FC45F3060000000, ptr addrspace(1) %out ret void } @@ -392,6 +546,16 @@ define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f32(ptr addrspace(1) %out) ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: store_inline_imm_m_inv_2pi_f32: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_mov_b32 s3, 0xf000 +; GFX942-NEXT: s_mov_b32 s2, -1 +; GFX942-NEXT: v_mov_b32_e32 v0, 0xbe22f983 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX942-NEXT: s_endpgm store float 0xBFC45F3060000000, ptr addrspace(1) %out ret void } @@ -416,6 +580,16 @@ define amdgpu_kernel void @store_literal_imm_f32(ptr addrspace(1) %out) { ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: store_literal_imm_f32: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_mov_b32 s3, 0xf000 +; GFX942-NEXT: s_mov_b32 s2, -1 +; GFX942-NEXT: v_mov_b32_e32 v0, 0x45800000 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX942-NEXT: s_endpgm store float 4096.0, ptr addrspace(1) %out ret void } @@ -442,6 +616,17 @@ define amdgpu_kernel void @add_inline_imm_0.0_f32(ptr addrspace(1) %out, float % ; VI-NEXT: v_add_f32_e64 v0, s6, 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: add_inline_imm_0.0_f32: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_mov_b32 s3, 0xf000 +; GFX942-NEXT: s_mov_b32 s2, -1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_add_f32_e64 v0, s6, 0 +; GFX942-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX942-NEXT: s_endpgm %y = fadd float %x, 0.0 store float %y, ptr addrspace(1) %out ret void @@ -469,6 +654,17 @@ define amdgpu_kernel void @add_inline_imm_0.5_f32(ptr addrspace(1) %out, float % ; VI-NEXT: v_add_f32_e64 v0, s6, 0.5 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: add_inline_imm_0.5_f32: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_mov_b32 s3, 0xf000 +; GFX942-NEXT: s_mov_b32 s2, -1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_add_f32_e64 v0, s6, 0.5 +; GFX942-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX942-NEXT: s_endpgm %y = fadd float %x, 0.5 store float %y, ptr addrspace(1) %out ret void @@ -496,6 +692,17 @@ define amdgpu_kernel void @add_inline_imm_neg_0.5_f32(ptr addrspace(1) %out, flo ; VI-NEXT: v_add_f32_e64 v0, s6, -0.5 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: add_inline_imm_neg_0.5_f32: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_mov_b32 s3, 0xf000 +; GFX942-NEXT: s_mov_b32 s2, -1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_add_f32_e64 v0, s6, -0.5 +; GFX942-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX942-NEXT: s_endpgm %y = fadd float %x, -0.5 store float %y, ptr addrspace(1) %out ret void @@ -523,6 +730,17 @@ define amdgpu_kernel void @add_inline_imm_1.0_f32(ptr addrspace(1) %out, float % ; VI-NEXT: v_add_f32_e64 v0, s6, 1.0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: add_inline_imm_1.0_f32: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_mov_b32 s3, 0xf000 +; GFX942-NEXT: s_mov_b32 s2, -1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_add_f32_e64 v0, s6, 1.0 +; GFX942-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX942-NEXT: s_endpgm %y = fadd float %x, 1.0 store float %y, ptr addrspace(1) %out ret void @@ -550,6 +768,17 @@ define amdgpu_kernel void @add_inline_imm_neg_1.0_f32(ptr addrspace(1) %out, flo ; VI-NEXT: v_add_f32_e64 v0, s6, -1.0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: add_inline_imm_neg_1.0_f32: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_mov_b32 s3, 0xf000 +; GFX942-NEXT: s_mov_b32 s2, -1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_add_f32_e64 v0, s6, -1.0 +; GFX942-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX942-NEXT: s_endpgm %y = fadd float %x, -1.0 store float %y, ptr addrspace(1) %out ret void @@ -577,6 +806,17 @@ define amdgpu_kernel void @add_inline_imm_2.0_f32(ptr addrspace(1) %out, float % ; VI-NEXT: v_add_f32_e64 v0, s6, 2.0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: add_inline_imm_2.0_f32: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_mov_b32 s3, 0xf000 +; GFX942-NEXT: s_mov_b32 s2, -1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_add_f32_e64 v0, s6, 2.0 +; GFX942-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX942-NEXT: s_endpgm %y = fadd float %x, 2.0 store float %y, ptr addrspace(1) %out ret void @@ -604,6 +844,17 @@ define amdgpu_kernel void @add_inline_imm_neg_2.0_f32(ptr addrspace(1) %out, flo ; VI-NEXT: v_add_f32_e64 v0, s6, -2.0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: add_inline_imm_neg_2.0_f32: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_mov_b32 s3, 0xf000 +; GFX942-NEXT: s_mov_b32 s2, -1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_add_f32_e64 v0, s6, -2.0 +; GFX942-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX942-NEXT: s_endpgm %y = fadd float %x, -2.0 store float %y, ptr addrspace(1) %out ret void @@ -631,6 +882,17 @@ define amdgpu_kernel void @add_inline_imm_4.0_f32(ptr addrspace(1) %out, float % ; VI-NEXT: v_add_f32_e64 v0, s6, 4.0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: add_inline_imm_4.0_f32: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_mov_b32 s3, 0xf000 +; GFX942-NEXT: s_mov_b32 s2, -1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_add_f32_e64 v0, s6, 4.0 +; GFX942-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX942-NEXT: s_endpgm %y = fadd float %x, 4.0 store float %y, ptr addrspace(1) %out ret void @@ -658,6 +920,17 @@ define amdgpu_kernel void @add_inline_imm_neg_4.0_f32(ptr addrspace(1) %out, flo ; VI-NEXT: v_add_f32_e64 v0, s6, -4.0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: add_inline_imm_neg_4.0_f32: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_mov_b32 s3, 0xf000 +; GFX942-NEXT: s_mov_b32 s2, -1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_add_f32_e64 v0, s6, -4.0 +; GFX942-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX942-NEXT: s_endpgm %y = fadd float %x, -4.0 store float %y, ptr addrspace(1) %out ret void @@ -699,6 +972,24 @@ define amdgpu_kernel void @commute_add_inline_imm_0.5_f32(ptr addrspace(1) %out, ; VI-NEXT: v_add_f32_e32 v0, 0.5, v0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: commute_add_inline_imm_0.5_f32: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX942-NEXT: s_mov_b32 s7, 0xf000 +; GFX942-NEXT: s_mov_b32 s6, -1 +; GFX942-NEXT: s_mov_b32 s10, s6 +; GFX942-NEXT: s_mov_b32 s11, s7 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b32 s9, s3 +; GFX942-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GFX942-NEXT: s_mov_b32 s4, s0 +; GFX942-NEXT: s_mov_b32 s5, s1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_add_f32_e32 v0, 0.5, v0 +; GFX942-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX942-NEXT: s_endpgm %x = load float, ptr addrspace(1) %in %y = fadd float %x, 0.5 store float %y, ptr addrspace(1) %out @@ -741,6 +1032,24 @@ define amdgpu_kernel void @commute_add_literal_f32(ptr addrspace(1) %out, ptr ad ; VI-NEXT: v_add_f32_e32 v0, 0x44800000, v0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: commute_add_literal_f32: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX942-NEXT: s_mov_b32 s7, 0xf000 +; GFX942-NEXT: s_mov_b32 s6, -1 +; GFX942-NEXT: s_mov_b32 s10, s6 +; GFX942-NEXT: s_mov_b32 s11, s7 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_mov_b32 s8, s2 +; GFX942-NEXT: s_mov_b32 s9, s3 +; GFX942-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GFX942-NEXT: s_mov_b32 s4, s0 +; GFX942-NEXT: s_mov_b32 s5, s1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_add_f32_e32 v0, 0x44800000, v0 +; GFX942-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX942-NEXT: s_endpgm %x = load float, ptr addrspace(1) %in %y = fadd float %x, 1024.0 store float %y, ptr addrspace(1) %out @@ -769,6 +1078,17 @@ define amdgpu_kernel void @add_inline_imm_1_f32(ptr addrspace(1) %out, float %x) ; VI-NEXT: v_add_f32_e64 v0, s6, 1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: add_inline_imm_1_f32: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_mov_b32 s3, 0xf000 +; GFX942-NEXT: s_mov_b32 s2, -1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_add_f32_e64 v0, s6, 1 +; GFX942-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX942-NEXT: s_endpgm %y = fadd float %x, 0x36a0000000000000 store float %y, ptr addrspace(1) %out ret void @@ -796,6 +1116,17 @@ define amdgpu_kernel void @add_inline_imm_2_f32(ptr addrspace(1) %out, float %x) ; VI-NEXT: v_add_f32_e64 v0, s6, 2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: add_inline_imm_2_f32: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_mov_b32 s3, 0xf000 +; GFX942-NEXT: s_mov_b32 s2, -1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_add_f32_e64 v0, s6, 2 +; GFX942-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX942-NEXT: s_endpgm %y = fadd float %x, 0x36b0000000000000 store float %y, ptr addrspace(1) %out ret void @@ -823,6 +1154,17 @@ define amdgpu_kernel void @add_inline_imm_16_f32(ptr addrspace(1) %out, float %x ; VI-NEXT: v_add_f32_e64 v0, s6, 16 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: add_inline_imm_16_f32: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_mov_b32 s3, 0xf000 +; GFX942-NEXT: s_mov_b32 s2, -1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_add_f32_e64 v0, s6, 16 +; GFX942-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX942-NEXT: s_endpgm %y = fadd float %x, 0x36e0000000000000 store float %y, ptr addrspace(1) %out ret void @@ -852,6 +1194,18 @@ define amdgpu_kernel void @add_inline_imm_neg_1_f32(ptr addrspace(1) %out, float ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: add_inline_imm_neg_1_f32: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_mov_b32 s3, 0xf000 +; GFX942-NEXT: s_mov_b32 s2, -1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_add_i32 s4, s6, -1 +; GFX942-NEXT: v_mov_b32_e32 v0, s4 +; GFX942-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX942-NEXT: s_endpgm %xbc = bitcast float %x to i32 %y = add i32 %xbc, -1 %ybc = bitcast i32 %y to float @@ -883,6 +1237,18 @@ define amdgpu_kernel void @add_inline_imm_neg_2_f32(ptr addrspace(1) %out, float ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: add_inline_imm_neg_2_f32: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_mov_b32 s3, 0xf000 +; GFX942-NEXT: s_mov_b32 s2, -1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_add_i32 s4, s6, -2 +; GFX942-NEXT: v_mov_b32_e32 v0, s4 +; GFX942-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX942-NEXT: s_endpgm %xbc = bitcast float %x to i32 %y = add i32 %xbc, -2 %ybc = bitcast i32 %y to float @@ -914,6 +1280,18 @@ define amdgpu_kernel void @add_inline_imm_neg_16_f32(ptr addrspace(1) %out, floa ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: add_inline_imm_neg_16_f32: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_mov_b32 s3, 0xf000 +; GFX942-NEXT: s_mov_b32 s2, -1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_add_i32 s4, s6, -16 +; GFX942-NEXT: v_mov_b32_e32 v0, s4 +; GFX942-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX942-NEXT: s_endpgm %xbc = bitcast float %x to i32 %y = add i32 %xbc, -16 %ybc = bitcast i32 %y to float @@ -943,6 +1321,17 @@ define amdgpu_kernel void @add_inline_imm_63_f32(ptr addrspace(1) %out, float %x ; VI-NEXT: v_add_f32_e64 v0, s6, 63 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: add_inline_imm_63_f32: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_mov_b32 s3, 0xf000 +; GFX942-NEXT: s_mov_b32 s2, -1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_add_f32_e64 v0, s6, 63 +; GFX942-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX942-NEXT: s_endpgm %y = fadd float %x, 0x36ff800000000000 store float %y, ptr addrspace(1) %out ret void @@ -970,6 +1359,17 @@ define amdgpu_kernel void @add_inline_imm_64_f32(ptr addrspace(1) %out, float %x ; VI-NEXT: v_add_f32_e64 v0, s6, 64 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: add_inline_imm_64_f32: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_mov_b32 s3, 0xf000 +; GFX942-NEXT: s_mov_b32 s2, -1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_add_f32_e64 v0, s6, 64 +; GFX942-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX942-NEXT: s_endpgm %y = fadd float %x, 0x3700000000000000 store float %y, ptr addrspace(1) %out ret void @@ -999,6 +1399,17 @@ define amdgpu_kernel void @add_inline_imm_0.0_f64(ptr addrspace(1) %out, [8 x i3 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: add_inline_imm_0.0_f64: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_mov_b32 s3, 0xf000 +; GFX942-NEXT: s_mov_b32 s2, -1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_add_f64 v[0:1], s[6:7], 0 +; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX942-NEXT: s_endpgm %y = fadd double %x, 0.0 store double %y, ptr addrspace(1) %out ret void @@ -1028,6 +1439,17 @@ define amdgpu_kernel void @add_inline_imm_0.5_f64(ptr addrspace(1) %out, [8 x i3 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: add_inline_imm_0.5_f64: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_mov_b32 s3, 0xf000 +; GFX942-NEXT: s_mov_b32 s2, -1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_add_f64 v[0:1], s[6:7], 0.5 +; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX942-NEXT: s_endpgm %y = fadd double %x, 0.5 store double %y, ptr addrspace(1) %out ret void @@ -1057,6 +1479,17 @@ define amdgpu_kernel void @add_inline_imm_neg_0.5_f64(ptr addrspace(1) %out, [8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: add_inline_imm_neg_0.5_f64: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_mov_b32 s3, 0xf000 +; GFX942-NEXT: s_mov_b32 s2, -1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_add_f64 v[0:1], s[6:7], -0.5 +; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX942-NEXT: s_endpgm %y = fadd double %x, -0.5 store double %y, ptr addrspace(1) %out ret void @@ -1086,6 +1519,17 @@ define amdgpu_kernel void @add_inline_imm_1.0_f64(ptr addrspace(1) %out, [8 x i3 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: add_inline_imm_1.0_f64: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_mov_b32 s3, 0xf000 +; GFX942-NEXT: s_mov_b32 s2, -1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_add_f64 v[0:1], s[6:7], 1.0 +; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX942-NEXT: s_endpgm %y = fadd double %x, 1.0 store double %y, ptr addrspace(1) %out ret void @@ -1115,6 +1559,17 @@ define amdgpu_kernel void @add_inline_imm_neg_1.0_f64(ptr addrspace(1) %out, [8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: add_inline_imm_neg_1.0_f64: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_mov_b32 s3, 0xf000 +; GFX942-NEXT: s_mov_b32 s2, -1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_add_f64 v[0:1], s[6:7], -1.0 +; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX942-NEXT: s_endpgm %y = fadd double %x, -1.0 store double %y, ptr addrspace(1) %out ret void @@ -1144,6 +1599,17 @@ define amdgpu_kernel void @add_inline_imm_2.0_f64(ptr addrspace(1) %out, [8 x i3 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: add_inline_imm_2.0_f64: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_mov_b32 s3, 0xf000 +; GFX942-NEXT: s_mov_b32 s2, -1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_add_f64 v[0:1], s[6:7], 2.0 +; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX942-NEXT: s_endpgm %y = fadd double %x, 2.0 store double %y, ptr addrspace(1) %out ret void @@ -1173,6 +1639,17 @@ define amdgpu_kernel void @add_inline_imm_neg_2.0_f64(ptr addrspace(1) %out, [8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: add_inline_imm_neg_2.0_f64: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_mov_b32 s3, 0xf000 +; GFX942-NEXT: s_mov_b32 s2, -1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_add_f64 v[0:1], s[6:7], -2.0 +; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX942-NEXT: s_endpgm %y = fadd double %x, -2.0 store double %y, ptr addrspace(1) %out ret void @@ -1202,6 +1679,17 @@ define amdgpu_kernel void @add_inline_imm_4.0_f64(ptr addrspace(1) %out, [8 x i3 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: add_inline_imm_4.0_f64: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_mov_b32 s3, 0xf000 +; GFX942-NEXT: s_mov_b32 s2, -1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_add_f64 v[0:1], s[6:7], 4.0 +; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX942-NEXT: s_endpgm %y = fadd double %x, 4.0 store double %y, ptr addrspace(1) %out ret void @@ -1231,6 +1719,17 @@ define amdgpu_kernel void @add_inline_imm_neg_4.0_f64(ptr addrspace(1) %out, [8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: add_inline_imm_neg_4.0_f64: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_mov_b32 s3, 0xf000 +; GFX942-NEXT: s_mov_b32 s2, -1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_add_f64 v[0:1], s[6:7], -4.0 +; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX942-NEXT: s_endpgm %y = fadd double %x, -4.0 store double %y, ptr addrspace(1) %out ret void @@ -1262,6 +1761,17 @@ define amdgpu_kernel void @add_inline_imm_inv_2pi_f64(ptr addrspace(1) %out, [8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: add_inline_imm_inv_2pi_f64: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_mov_b32 s3, 0xf000 +; GFX942-NEXT: s_mov_b32 s2, -1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_add_f64 v[0:1], s[6:7], 0.15915494309189532 +; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX942-NEXT: s_endpgm %y = fadd double %x, 0x3fc45f306dc9c882 store double %y, ptr addrspace(1) %out ret void @@ -1295,6 +1805,19 @@ define amdgpu_kernel void @add_m_inv_2pi_f64(ptr addrspace(1) %out, [8 x i32], d ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: add_m_inv_2pi_f64: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c +; GFX942-NEXT: v_mov_b32_e32 v0, 0x6dc9c882 +; GFX942-NEXT: v_mov_b32_e32 v1, 0xbfc45f30 +; GFX942-NEXT: s_mov_b32 s3, 0xf000 +; GFX942-NEXT: s_mov_b32 s2, -1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1] +; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX942-NEXT: s_endpgm %y = fadd double %x, 0xbfc45f306dc9c882 store double %y, ptr addrspace(1) %out ret void @@ -1324,6 +1847,17 @@ define amdgpu_kernel void @add_inline_imm_1_f64(ptr addrspace(1) %out, [8 x i32] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: add_inline_imm_1_f64: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_mov_b32 s3, 0xf000 +; GFX942-NEXT: s_mov_b32 s2, -1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_add_f64 v[0:1], s[6:7], 1 +; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX942-NEXT: s_endpgm %y = fadd double %x, 0x0000000000000001 store double %y, ptr addrspace(1) %out ret void @@ -1353,6 +1887,17 @@ define amdgpu_kernel void @add_inline_imm_2_f64(ptr addrspace(1) %out, [8 x i32] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: add_inline_imm_2_f64: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_mov_b32 s3, 0xf000 +; GFX942-NEXT: s_mov_b32 s2, -1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_add_f64 v[0:1], s[6:7], 2 +; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX942-NEXT: s_endpgm %y = fadd double %x, 0x0000000000000002 store double %y, ptr addrspace(1) %out ret void @@ -1382,6 +1927,17 @@ define amdgpu_kernel void @add_inline_imm_16_f64(ptr addrspace(1) %out, [8 x i32 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: add_inline_imm_16_f64: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_mov_b32 s3, 0xf000 +; GFX942-NEXT: s_mov_b32 s2, -1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_add_f64 v[0:1], s[6:7], 16 +; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX942-NEXT: s_endpgm %y = fadd double %x, 0x0000000000000010 store double %y, ptr addrspace(1) %out ret void @@ -1409,6 +1965,17 @@ define amdgpu_kernel void @add_inline_imm_neg_1_f64(ptr addrspace(1) %out, [8 x ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: add_inline_imm_neg_1_f64: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: v_mov_b32_e32 v0, -1 +; GFX942-NEXT: s_mov_b32 s3, 0xf000 +; GFX942-NEXT: s_mov_b32 s2, -1 +; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX942-NEXT: s_endpgm %y = fadd double %x, 0xffffffffffffffff store double %y, ptr addrspace(1) %out ret void @@ -1436,6 +2003,17 @@ define amdgpu_kernel void @add_inline_imm_neg_2_f64(ptr addrspace(1) %out, [8 x ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: add_inline_imm_neg_2_f64: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_mov_b32 s3, 0xf000 +; GFX942-NEXT: s_mov_b32 s2, -1 +; GFX942-NEXT: v_mov_b32_e32 v0, -2 +; GFX942-NEXT: v_mov_b32_e32 v1, -1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX942-NEXT: s_endpgm %y = fadd double %x, 0xfffffffffffffffe store double %y, ptr addrspace(1) %out ret void @@ -1463,6 +2041,17 @@ define amdgpu_kernel void @add_inline_imm_neg_16_f64(ptr addrspace(1) %out, [8 x ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: add_inline_imm_neg_16_f64: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_mov_b32 s3, 0xf000 +; GFX942-NEXT: s_mov_b32 s2, -1 +; GFX942-NEXT: v_mov_b32_e32 v0, -16 +; GFX942-NEXT: v_mov_b32_e32 v1, -1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX942-NEXT: s_endpgm %y = fadd double %x, 0xfffffffffffffff0 store double %y, ptr addrspace(1) %out ret void @@ -1492,6 +2081,17 @@ define amdgpu_kernel void @add_inline_imm_63_f64(ptr addrspace(1) %out, [8 x i32 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: add_inline_imm_63_f64: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_mov_b32 s3, 0xf000 +; GFX942-NEXT: s_mov_b32 s2, -1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_add_f64 v[0:1], s[6:7], 63 +; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX942-NEXT: s_endpgm %y = fadd double %x, 0x000000000000003F store double %y, ptr addrspace(1) %out ret void @@ -1521,6 +2121,17 @@ define amdgpu_kernel void @add_inline_imm_64_f64(ptr addrspace(1) %out, [8 x i32 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: add_inline_imm_64_f64: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_mov_b32 s3, 0xf000 +; GFX942-NEXT: s_mov_b32 s2, -1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_add_f64 v[0:1], s[6:7], 64 +; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX942-NEXT: s_endpgm %y = fadd double %x, 0x0000000000000040 store double %y, ptr addrspace(1) %out ret void @@ -1548,6 +2159,17 @@ define amdgpu_kernel void @store_inline_imm_0.0_f64(ptr addrspace(1) %out) { ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: store_inline_imm_0.0_f64: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_mov_b32 s3, 0xf000 +; GFX942-NEXT: s_mov_b32 s2, -1 +; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX942-NEXT: s_endpgm store double 0.0, ptr addrspace(1) %out ret void } @@ -1574,6 +2196,17 @@ define amdgpu_kernel void @store_literal_imm_neg_0.0_f64(ptr addrspace(1) %out) ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: store_literal_imm_neg_0.0_f64: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_mov_b32 s3, 0xf000 +; GFX942-NEXT: s_mov_b32 s2, -1 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX942-NEXT: s_endpgm store double -0.0, ptr addrspace(1) %out ret void } @@ -1600,6 +2233,17 @@ define amdgpu_kernel void @store_inline_imm_0.5_f64(ptr addrspace(1) %out) { ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: store_inline_imm_0.5_f64: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_mov_b32 s3, 0xf000 +; GFX942-NEXT: s_mov_b32 s2, -1 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, 0x3fe00000 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX942-NEXT: s_endpgm store double 0.5, ptr addrspace(1) %out ret void } @@ -1626,6 +2270,17 @@ define amdgpu_kernel void @store_inline_imm_m_0.5_f64(ptr addrspace(1) %out) { ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: store_inline_imm_m_0.5_f64: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_mov_b32 s3, 0xf000 +; GFX942-NEXT: s_mov_b32 s2, -1 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, 0xbfe00000 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX942-NEXT: s_endpgm store double -0.5, ptr addrspace(1) %out ret void } @@ -1652,6 +2307,17 @@ define amdgpu_kernel void @store_inline_imm_1.0_f64(ptr addrspace(1) %out) { ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: store_inline_imm_1.0_f64: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_mov_b32 s3, 0xf000 +; GFX942-NEXT: s_mov_b32 s2, -1 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, 0x3ff00000 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX942-NEXT: s_endpgm store double 1.0, ptr addrspace(1) %out ret void } @@ -1678,6 +2344,17 @@ define amdgpu_kernel void @store_inline_imm_m_1.0_f64(ptr addrspace(1) %out) { ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: store_inline_imm_m_1.0_f64: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_mov_b32 s3, 0xf000 +; GFX942-NEXT: s_mov_b32 s2, -1 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, 0xbff00000 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX942-NEXT: s_endpgm store double -1.0, ptr addrspace(1) %out ret void } @@ -1704,6 +2381,17 @@ define amdgpu_kernel void @store_inline_imm_2.0_f64(ptr addrspace(1) %out) { ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: store_inline_imm_2.0_f64: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_mov_b32 s3, 0xf000 +; GFX942-NEXT: s_mov_b32 s2, -1 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX942-NEXT: s_endpgm store double 2.0, ptr addrspace(1) %out ret void } @@ -1730,6 +2418,17 @@ define amdgpu_kernel void @store_inline_imm_m_2.0_f64(ptr addrspace(1) %out) { ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: store_inline_imm_m_2.0_f64: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_mov_b32 s3, 0xf000 +; GFX942-NEXT: s_mov_b32 s2, -1 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, -2.0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX942-NEXT: s_endpgm store double -2.0, ptr addrspace(1) %out ret void } @@ -1756,6 +2455,17 @@ define amdgpu_kernel void @store_inline_imm_4.0_f64(ptr addrspace(1) %out) { ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: store_inline_imm_4.0_f64: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_mov_b32 s3, 0xf000 +; GFX942-NEXT: s_mov_b32 s2, -1 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, 0x40100000 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX942-NEXT: s_endpgm store double 4.0, ptr addrspace(1) %out ret void } @@ -1782,6 +2492,17 @@ define amdgpu_kernel void @store_inline_imm_m_4.0_f64(ptr addrspace(1) %out) { ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: store_inline_imm_m_4.0_f64: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_mov_b32 s3, 0xf000 +; GFX942-NEXT: s_mov_b32 s2, -1 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, 0xc0100000 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX942-NEXT: s_endpgm store double -4.0, ptr addrspace(1) %out ret void } @@ -1808,6 +2529,17 @@ define amdgpu_kernel void @store_inv_2pi_f64(ptr addrspace(1) %out) { ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: store_inv_2pi_f64: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_mov_b32 s3, 0xf000 +; GFX942-NEXT: s_mov_b32 s2, -1 +; GFX942-NEXT: v_mov_b32_e32 v0, 0x6dc9c882 +; GFX942-NEXT: v_mov_b32_e32 v1, 0x3fc45f30 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX942-NEXT: s_endpgm store double 0x3fc45f306dc9c882, ptr addrspace(1) %out ret void } @@ -1834,6 +2566,17 @@ define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f64(ptr addrspace(1) %out) ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: store_inline_imm_m_inv_2pi_f64: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_mov_b32 s3, 0xf000 +; GFX942-NEXT: s_mov_b32 s2, -1 +; GFX942-NEXT: v_mov_b32_e32 v0, 0x6dc9c882 +; GFX942-NEXT: v_mov_b32_e32 v1, 0xbfc45f30 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX942-NEXT: s_endpgm store double 0xbfc45f306dc9c882, ptr addrspace(1) %out ret void } @@ -1860,6 +2603,17 @@ define amdgpu_kernel void @store_literal_imm_f64(ptr addrspace(1) %out) { ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: store_literal_imm_f64: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: s_mov_b32 s3, 0xf000 +; GFX942-NEXT: s_mov_b32 s2, -1 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, 0x40b00000 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX942-NEXT: s_endpgm store double 4096.0, ptr addrspace(1) %out ret void } @@ -1871,6 +2625,13 @@ define amdgpu_vs void @literal_folding(float %arg) { ; GCN-NEXT: v_mul_f32_e32 v0, 0xbf4353f8, v0 ; GCN-NEXT: exp pos0 v1, v1, v0, v0 done ; GCN-NEXT: s_endpgm +; +; GFX942-LABEL: literal_folding: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: v_mul_f32_e32 v1, 0x3f4353f8, v0 +; GFX942-NEXT: v_mul_f32_e32 v0, 0xbf4353f8, v0 +; GFX942-NEXT: exp pos0 v1, v1, v0, v0 done +; GFX942-NEXT: s_endpgm main_body: %tmp = fmul float %arg, 0x3FE86A7F00000000 %tmp1 = fmul float %arg, 0xBFE86A7F00000000 diff --git a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll index 6a45b961a61c8..101787abf8ea7 100644 --- a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck -enable-var-scope -check-prefixes=GFX942 %s declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone @@ -32,6 +33,16 @@ define amdgpu_kernel void @sint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %in) ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: sint_to_fp_i32_to_f64: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_cvt_f64_i32_e32 v[0:1], s2 +; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX942-NEXT: s_endpgm %result = sitofp i32 %in to double store double %result, ptr addrspace(1) %out ret void @@ -73,6 +84,18 @@ define amdgpu_kernel void @sint_to_fp_i1_f64(ptr addrspace(1) %out, i32 %in) { ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: sint_to_fp_i1_f64: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_cmp_eq_u32 s2, 0 +; GFX942-NEXT: s_cselect_b32 s2, 0xbff00000, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1] +; GFX942-NEXT: s_endpgm %cmp = icmp eq i32 %in, 0 %fp = sitofp i1 %cmp to double store double %fp, ptr addrspace(1) %out, align 4 @@ -113,6 +136,19 @@ define amdgpu_kernel void @sint_to_fp_i1_f64_load(ptr addrspace(1) %out, i1 %in) ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: sint_to_fp_i1_f64_load: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_bitcmp1_b32 s2, 0 +; GFX942-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX942-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[2:3] +; GFX942-NEXT: v_cvt_f64_i32_e32 v[0:1], v0 +; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX942-NEXT: s_endpgm %fp = sitofp i1 %in to double store double %fp, ptr addrspace(1) %out, align 8 ret void @@ -150,6 +186,18 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %i ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: s_sint_to_fp_i64_to_f64: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_cvt_f64_i32_e32 v[0:1], s3 +; GFX942-NEXT: v_ldexp_f64 v[0:1], v[0:1], 32 +; GFX942-NEXT: v_cvt_f64_u32_e32 v[2:3], s2 +; GFX942-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; GFX942-NEXT: s_endpgm %result = sitofp i64 %in to double store double %result, ptr addrspace(1) %out ret void @@ -199,6 +247,22 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr ad ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: v_sint_to_fp_i64_to_f64: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_cvt_f64_i32_e32 v[2:3], v1 +; GFX942-NEXT: v_ldexp_f64 v[2:3], v[2:3], 32 +; GFX942-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 +; GFX942-NEXT: v_add_f64 v[0:1], v[2:3], v[0:1] +; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; GFX942-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid %val = load i64, ptr addrspace(1) %gep, align 8 @@ -238,6 +302,17 @@ define amdgpu_kernel void @s_sint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in) ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: s_sint_to_fp_i8_to_f64: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_sext_i32_i8 s2, s2 +; GFX942-NEXT: v_cvt_f64_i32_e32 v[0:1], s2 +; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX942-NEXT: s_endpgm %fp = sitofp i8 %in to double store double %fp, ptr addrspace(1) %out ret void @@ -258,6 +333,14 @@ define double @v_sint_to_fp_i8_to_f64(i8 %in) { ; VI-NEXT: v_bfe_i32 v0, v0, 0, 16 ; VI-NEXT: v_cvt_f64_i32_e32 v[0:1], v0 ; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-LABEL: v_sint_to_fp_i8_to_f64: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX942-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX942-NEXT: v_cvt_f64_i32_e32 v[0:1], v0 +; GFX942-NEXT: s_setpc_b64 s[30:31] %fp = sitofp i8 %in to double ret double %fp } @@ -296,6 +379,18 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: s_select_sint_to_fp_i1_vals_f64: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_cmp_eq_u32 s2, 0 +; GFX942-NEXT: s_cselect_b32 s2, 0xbff00000, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1] +; GFX942-NEXT: s_endpgm %cmp = icmp eq i32 %in, 0 %select = select i1 %cmp, double -1.0, double 0.0 store double %select, ptr addrspace(1) %out, align 8 @@ -313,6 +408,18 @@ define void @v_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) { ; GCN-NEXT: flat_store_dwordx2 v[0:1], v[3:4] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-LABEL: v_select_sint_to_fp_i1_vals_f64: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0xbff00000 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v5, 0, v3, vcc +; GFX942-NEXT: global_store_dwordx2 v[0:1], v[4:5], off +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %in, 0 %select = select i1 %cmp, double -1.0, double 0.0 store double %select, ptr addrspace(1) %out, align 8 @@ -353,6 +460,18 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_i64(ptr addrspace(1) %out ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: s_select_sint_to_fp_i1_vals_i64: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_cmp_eq_u32 s2, 0 +; GFX942-NEXT: s_cselect_b32 s2, 0xbff00000, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1] +; GFX942-NEXT: s_endpgm %cmp = icmp eq i32 %in, 0 %select = select i1 %cmp, i64 u0xbff0000000000000, i64 0 store i64 %select, ptr addrspace(1) %out, align 8 @@ -370,6 +489,18 @@ define void @v_select_sint_to_fp_i1_vals_i64(ptr addrspace(1) %out, i32 %in) { ; GCN-NEXT: flat_store_dwordx2 v[0:1], v[3:4] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-LABEL: v_select_sint_to_fp_i1_vals_i64: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0xbff00000 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v5, 0, v3, vcc +; GFX942-NEXT: global_store_dwordx2 v[0:1], v[4:5], off +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %in, 0 %select = select i1 %cmp, i64 u0xbff0000000000000, i64 0 store i64 %select, ptr addrspace(1) %out, align 8 @@ -388,6 +519,18 @@ define void @v_swap_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in ; GCN-NEXT: flat_store_dwordx2 v[0:1], v[3:4] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-LABEL: v_swap_select_sint_to_fp_i1_vals_f64: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0xbff00000 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e64 v5, v3, 0, vcc +; GFX942-NEXT: global_store_dwordx2 v[0:1], v[4:5], off +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %in, 0 %select = select i1 %cmp, double 0.0, double -1.0 store double %select, ptr addrspace(1) %out, align 8 @@ -429,6 +572,18 @@ define amdgpu_kernel void @s_swap_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: s_swap_select_sint_to_fp_i1_vals_f64: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_cmp_eq_u32 s2, 0 +; GFX942-NEXT: s_cselect_b32 s2, 0, 0xbff00000 +; GFX942-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1] +; GFX942-NEXT: s_endpgm %cmp = icmp eq i32 %in, 0 %select = select i1 %cmp, double 0.0, double -1.0 store double %select, ptr addrspace(1) %out, align 8 diff --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll index ab278c3b63a3e..983acfc2c0699 100644 --- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck -enable-var-scope -check-prefixes=GFX942 %s declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone @@ -48,6 +49,22 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr ad ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: v_uint_to_fp_i64_to_f64: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_cvt_f64_u32_e32 v[2:3], v1 +; GFX942-NEXT: v_ldexp_f64 v[2:3], v[2:3], 32 +; GFX942-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 +; GFX942-NEXT: v_add_f64 v[0:1], v[2:3], v[0:1] +; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; GFX942-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid %val = load i64, ptr addrspace(1) %gep, align 8 @@ -88,6 +105,18 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %i ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: s_uint_to_fp_i64_to_f64: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_cvt_f64_u32_e32 v[0:1], s3 +; GFX942-NEXT: v_ldexp_f64 v[0:1], v[0:1], 32 +; GFX942-NEXT: v_cvt_f64_u32_e32 v[2:3], s2 +; GFX942-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; GFX942-NEXT: s_endpgm %cast = uitofp i64 %in to double store double %cast, ptr addrspace(1) %out, align 8 ret void @@ -136,6 +165,23 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f64(ptr addrspace(1) %out, <2 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: s_uint_to_fp_v2i64_to_v2f64: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_cvt_f64_u32_e32 v[0:1], s3 +; GFX942-NEXT: v_cvt_f64_u32_e32 v[2:3], s2 +; GFX942-NEXT: v_cvt_f64_u32_e32 v[4:5], s1 +; GFX942-NEXT: v_ldexp_f64 v[0:1], v[0:1], 32 +; GFX942-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] +; GFX942-NEXT: v_ldexp_f64 v[0:1], v[4:5], 32 +; GFX942-NEXT: v_cvt_f64_u32_e32 v[4:5], s0 +; GFX942-NEXT: v_add_f64 v[0:1], v[0:1], v[4:5] +; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[6:7] +; GFX942-NEXT: s_endpgm %cast = uitofp <2 x i64> %in to <2 x double> store <2 x double> %cast, ptr addrspace(1) %out, align 16 ret void @@ -210,6 +256,32 @@ define amdgpu_kernel void @s_uint_to_fp_v4i64_to_v4f64(ptr addrspace(1) %out, <4 ; VI-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: s_uint_to_fp_v4i64_to_v4f64: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_cvt_f64_u32_e32 v[0:1], s11 +; GFX942-NEXT: v_cvt_f64_u32_e32 v[2:3], s10 +; GFX942-NEXT: v_cvt_f64_u32_e32 v[4:5], s9 +; GFX942-NEXT: v_ldexp_f64 v[0:1], v[0:1], 32 +; GFX942-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] +; GFX942-NEXT: v_ldexp_f64 v[0:1], v[4:5], 32 +; GFX942-NEXT: v_cvt_f64_u32_e32 v[4:5], s8 +; GFX942-NEXT: v_add_f64 v[0:1], v[0:1], v[4:5] +; GFX942-NEXT: v_cvt_f64_u32_e32 v[4:5], s15 +; GFX942-NEXT: v_ldexp_f64 v[4:5], v[4:5], 32 +; GFX942-NEXT: v_cvt_f64_u32_e32 v[6:7], s14 +; GFX942-NEXT: v_add_f64 v[6:7], v[4:5], v[6:7] +; GFX942-NEXT: v_cvt_f64_u32_e32 v[4:5], s13 +; GFX942-NEXT: v_ldexp_f64 v[4:5], v[4:5], 32 +; GFX942-NEXT: v_cvt_f64_u32_e32 v[8:9], s12 +; GFX942-NEXT: v_add_f64 v[4:5], v[4:5], v[8:9] +; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] +; GFX942-NEXT: s_endpgm %cast = uitofp <4 x i64> %in to <4 x double> store <4 x double> %cast, ptr addrspace(1) %out, align 16 ret void @@ -243,6 +315,16 @@ define amdgpu_kernel void @s_uint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %i ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: s_uint_to_fp_i32_to_f64: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 +; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX942-NEXT: s_endpgm %cast = uitofp i32 %in to double store double %cast, ptr addrspace(1) %out, align 8 ret void @@ -262,6 +344,16 @@ define amdgpu_kernel void @s_uint_to_fp_v2i32_to_v2f64(ptr addrspace(1) %out, <2 ; GCN-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm +; +; GFX942-LABEL: s_uint_to_fp_v2i32_to_v2f64: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_cvt_f64_u32_e32 v[2:3], s3 +; GFX942-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 +; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_endpgm %cast = uitofp <2 x i32> %in to <2 x double> store <2 x double> %cast, ptr addrspace(1) %out, align 16 ret void @@ -313,6 +405,20 @@ define amdgpu_kernel void @s_uint_to_fp_v4i32_to_v4f64(ptr addrspace(1) %out, <4 ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: s_uint_to_fp_v4i32_to_v4f64: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_cvt_f64_u32_e32 v[6:7], s3 +; GFX942-NEXT: v_cvt_f64_u32_e32 v[4:5], s2 +; GFX942-NEXT: v_cvt_f64_u32_e32 v[2:3], s1 +; GFX942-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[6:7] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[6:7] +; GFX942-NEXT: s_endpgm %cast = uitofp <4 x i32> %in to <4 x double> store <4 x double> %cast, ptr addrspace(1) %out, align 16 ret void @@ -354,6 +460,18 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64(ptr addrspace(1) %out, i32 %in) ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: uint_to_fp_i1_to_f64: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_cmp_eq_u32 s2, 0 +; GFX942-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1] +; GFX942-NEXT: s_endpgm %cmp = icmp eq i32 %in, 0 %fp = uitofp i1 %cmp to double store double %fp, ptr addrspace(1) %out, align 4 @@ -394,6 +512,19 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64_load(ptr addrspace(1) %out, i1 % ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: uint_to_fp_i1_to_f64_load: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_bitcmp1_b32 s2, 0 +; GFX942-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX942-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] +; GFX942-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 +; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX942-NEXT: s_endpgm %fp = uitofp i1 %in to double store double %fp, ptr addrspace(1) %out, align 8 ret void @@ -429,6 +560,17 @@ define amdgpu_kernel void @s_uint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in) ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: s_uint_to_fp_i8_to_f64: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_and_b32 s2, s2, 0xff +; GFX942-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 +; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX942-NEXT: s_endpgm %fp = uitofp i8 %in to double store double %fp, ptr addrspace(1) %out ret void @@ -450,6 +592,14 @@ define double @v_uint_to_fp_i8_to_f64(i8 %in) { ; VI-NEXT: v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 ; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-LABEL: v_uint_to_fp_i8_to_f64: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b32 s0, 0xffff +; GFX942-NEXT: v_and_b32_sdwa v0, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX942-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 +; GFX942-NEXT: s_setpc_b64 s[30:31] %fp = uitofp i8 %in to double ret double %fp } @@ -488,6 +638,18 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: s_select_uint_to_fp_i1_vals_f64: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_cmp_eq_u32 s2, 0 +; GFX942-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1] +; GFX942-NEXT: s_endpgm %cmp = icmp eq i32 %in, 0 %select = select i1 %cmp, double 1.0, double 0.0 store double %select, ptr addrspace(1) %out, align 8 @@ -505,6 +667,18 @@ define void @v_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) { ; GCN-NEXT: flat_store_dwordx2 v[0:1], v[3:4] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-LABEL: v_select_uint_to_fp_i1_vals_f64: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0x3ff00000 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v5, 0, v3, vcc +; GFX942-NEXT: global_store_dwordx2 v[0:1], v[4:5], off +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %in, 0 %select = select i1 %cmp, double 1.0, double 0.0 store double %select, ptr addrspace(1) %out, align 8 @@ -545,6 +719,18 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_i64(ptr addrspace(1) %out ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: s_select_uint_to_fp_i1_vals_i64: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_cmp_eq_u32 s2, 0 +; GFX942-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1] +; GFX942-NEXT: s_endpgm %cmp = icmp eq i32 %in, 0 %select = select i1 %cmp, i64 u0x3ff0000000000000, i64 0 store i64 %select, ptr addrspace(1) %out, align 8 @@ -562,6 +748,18 @@ define void @v_select_uint_to_fp_i1_vals_i64(ptr addrspace(1) %out, i32 %in) { ; GCN-NEXT: flat_store_dwordx2 v[0:1], v[3:4] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-LABEL: v_select_uint_to_fp_i1_vals_i64: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0x3ff00000 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v5, 0, v3, vcc +; GFX942-NEXT: global_store_dwordx2 v[0:1], v[4:5], off +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %in, 0 %select = select i1 %cmp, i64 u0x3ff0000000000000, i64 0 store i64 %select, ptr addrspace(1) %out, align 8 @@ -603,6 +801,18 @@ define amdgpu_kernel void @s_swap_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm +; +; GFX942-LABEL: s_swap_select_uint_to_fp_i1_vals_f64: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_cmp_eq_u32 s2, 0 +; GFX942-NEXT: s_cselect_b32 s2, 0, 0x3ff00000 +; GFX942-NEXT: v_mov_b32_e32 v1, s2 +; GFX942-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1] +; GFX942-NEXT: s_endpgm %cmp = icmp eq i32 %in, 0 %select = select i1 %cmp, double 0.0, double 1.0 store double %select, ptr addrspace(1) %out, align 8 @@ -620,6 +830,18 @@ define void @v_swap_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in ; GCN-NEXT: flat_store_dwordx2 v[0:1], v[3:4] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-LABEL: v_swap_select_uint_to_fp_i1_vals_f64: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v3, 0x3ff00000 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e64 v5, v3, 0, vcc +; GFX942-NEXT: global_store_dwordx2 v[0:1], v[4:5], off +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %in, 0 %select = select i1 %cmp, double 0.0, double 1.0 store double %select, ptr addrspace(1) %out, align 8