diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll index 1ca67c4acf7a4..239408d31482f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll @@ -2,7 +2,8 @@ ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tahiti -o - < %s | FileCheck %s --check-prefixes=GFX,GFX6 ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji -o - < %s | FileCheck %s --check-prefixes=GFX,GFX8 ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -o - < %s | FileCheck %s --check-prefixes=GFX,GFX10 -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1250 -o - < %s | FileCheck %s --check-prefixes=GFX,GFX1250 +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -o - < %s | FileCheck %s --check-prefixes=GFX,GFX1250,GFX1250-FAKE16 +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -o - < %s | FileCheck %s --check-prefixes=GFX,GFX1250,GFX1250-REAL16 declare i16 @llvm.abs.i16(i16, i1) declare i32 @llvm.abs.i32(i32, i1) @@ -170,14 +171,23 @@ define i16 @abs_vgpr_i16(i16 %arg) { ; GFX10-NEXT: v_max_i16 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX1250-LABEL: abs_vgpr_i16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_sub_nc_u16 v1, 0, v0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_max_i16 v0, v0, v1 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-FAKE16-LABEL: abs_vgpr_i16: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_sub_nc_u16 v1, 0, v0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_max_i16 v0, v0, v1 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-REAL16-LABEL: abs_vgpr_i16: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: v_sub_nc_u16 v0.h, 0, v0.l +; GFX1250-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-REAL16-NEXT: v_max_i16 v0.l, v0.l, v0.h +; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31] %res = call i16 @llvm.abs.i16(i16 %arg, i1 false) ret i16 %res } @@ -390,19 +400,33 @@ define <2 x i8> @abs_vgpr_v2i8(<2 x i8> %arg) { ; GFX10-NEXT: v_max_i16 v1, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX1250-LABEL: abs_vgpr_v2i8: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_bfe_i32 v0, v0, 0, 8 -; GFX1250-NEXT: v_bfe_i32 v1, v1, 0, 8 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_sub_nc_u16 v2, 0, v0 -; GFX1250-NEXT: v_sub_nc_u16 v3, 0, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_max_i16 v0, v0, v2 -; GFX1250-NEXT: v_max_i16 v1, v1, v3 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-FAKE16-LABEL: abs_vgpr_v2i8: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX1250-FAKE16-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-FAKE16-NEXT: v_sub_nc_u16 v2, 0, v0 +; GFX1250-FAKE16-NEXT: v_sub_nc_u16 v3, 0, v1 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-FAKE16-NEXT: v_max_i16 v0, v0, v2 +; GFX1250-FAKE16-NEXT: v_max_i16 v1, v1, v3 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-REAL16-LABEL: abs_vgpr_v2i8: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: v_bfe_i32 v2, v0, 0, 8 +; GFX1250-REAL16-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX1250-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-REAL16-NEXT: v_sub_nc_u16 v0.l, 0, v2.l +; GFX1250-REAL16-NEXT: v_sub_nc_u16 v0.h, 0, v1.l +; GFX1250-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-REAL16-NEXT: v_max_i16 v0.l, v2.l, v0.l +; GFX1250-REAL16-NEXT: v_max_i16 v1.l, v1.l, v0.h +; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31] %res = call <2 x i8> @llvm.abs.v2i8(<2 x i8> %arg, i1 false) ret <2 x i8> %res } @@ -493,23 +517,41 @@ define <3 x i8> @abs_vgpr_v3i8(<3 x i8> %arg) { ; GFX10-NEXT: v_max_i16 v2, v2, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX1250-LABEL: abs_vgpr_v3i8: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_bfe_i32 v0, v0, 0, 8 -; GFX1250-NEXT: v_bfe_i32 v1, v1, 0, 8 -; GFX1250-NEXT: v_bfe_i32 v2, v2, 0, 8 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250-NEXT: v_sub_nc_u16 v3, 0, v0 -; GFX1250-NEXT: v_sub_nc_u16 v4, 0, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250-NEXT: v_sub_nc_u16 v5, 0, v2 -; GFX1250-NEXT: v_max_i16 v0, v0, v3 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250-NEXT: v_max_i16 v1, v1, v4 -; GFX1250-NEXT: v_max_i16 v2, v2, v5 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-FAKE16-LABEL: abs_vgpr_v3i8: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX1250-FAKE16-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX1250-FAKE16-NEXT: v_bfe_i32 v2, v2, 0, 8 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-FAKE16-NEXT: v_sub_nc_u16 v3, 0, v0 +; GFX1250-FAKE16-NEXT: v_sub_nc_u16 v4, 0, v1 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-FAKE16-NEXT: v_sub_nc_u16 v5, 0, v2 +; GFX1250-FAKE16-NEXT: v_max_i16 v0, v0, v3 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-FAKE16-NEXT: v_max_i16 v1, v1, v4 +; GFX1250-FAKE16-NEXT: v_max_i16 v2, v2, v5 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-REAL16-LABEL: abs_vgpr_v3i8: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: v_bfe_i32 v3, v0, 0, 8 +; GFX1250-REAL16-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX1250-REAL16-NEXT: v_bfe_i32 v2, v2, 0, 8 +; GFX1250-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-REAL16-NEXT: v_sub_nc_u16 v0.l, 0, v3.l +; GFX1250-REAL16-NEXT: v_sub_nc_u16 v0.h, 0, v1.l +; GFX1250-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-REAL16-NEXT: v_sub_nc_u16 v1.h, 0, v2.l +; GFX1250-REAL16-NEXT: v_max_i16 v0.l, v3.l, v0.l +; GFX1250-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-REAL16-NEXT: v_max_i16 v1.l, v1.l, v0.h +; GFX1250-REAL16-NEXT: v_max_i16 v2.l, v2.l, v1.h +; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31] %res = call <3 x i8> @llvm.abs.v3i8(<3 x i8> %arg, i1 false) ret <3 x i8> %res } @@ -694,16 +736,27 @@ define <3 x i16> @abs_vgpr_v3i16(<3 x i16> %arg) { ; GFX10-NEXT: v_max_i16 v1, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX1250-LABEL: abs_vgpr_v3i16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_pk_sub_i16 v2, 0, v0 -; GFX1250-NEXT: v_sub_nc_u16 v3, 0, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_pk_max_i16 v0, v0, v2 -; GFX1250-NEXT: v_max_i16 v1, v1, v3 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-FAKE16-LABEL: abs_vgpr_v3i16: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_pk_sub_i16 v2, 0, v0 +; GFX1250-FAKE16-NEXT: v_sub_nc_u16 v3, 0, v1 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-FAKE16-NEXT: v_pk_max_i16 v0, v0, v2 +; GFX1250-FAKE16-NEXT: v_max_i16 v1, v1, v3 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-REAL16-LABEL: abs_vgpr_v3i16: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: v_pk_sub_i16 v2, 0, v0 +; GFX1250-REAL16-NEXT: v_sub_nc_u16 v1.h, 0, v1.l +; GFX1250-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-REAL16-NEXT: v_pk_max_i16 v0, v0, v2 +; GFX1250-REAL16-NEXT: v_max_i16 v1.l, v1.l, v1.h +; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31] %res = call <3 x i16> @llvm.abs.v3i16(<3 x i16> %arg, i1 false) ret <3 x i16> %res } diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn-sin-cos-f16-f32.ll b/llvm/test/CodeGen/AMDGPU/amdgcn-sin-cos-f16-f32.ll index 802a9722c237a..97572e995f155 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn-sin-cos-f16-f32.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn-sin-cos-f16-f32.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --filter-out "s_setreg_imm32_b32" --filter-out "shader" --version 6 -; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GCN,FAKE16 %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GCN,REAL16 %s define amdgpu_ps float @v_sin_f32(float %src) #1 { ; GCN-LABEL: v_sin_f32: @@ -18,17 +19,25 @@ define amdgpu_ps float @s_sin_f32(float inreg %src) #1 { } define amdgpu_ps half @v_sin_f16(half %src) #1 { -; GCN-LABEL: v_sin_f16: -; GCN: ; %bb.0: -; GCN: v_sin_f16_e32 v0, v0 +; FAKE16-LABEL: v_sin_f16: +; FAKE16: ; %bb.0: +; FAKE16: v_sin_f16_e32 v0, v0 +; +; REAL16-LABEL: v_sin_f16: +; REAL16: ; %bb.0: +; REAL16: v_sin_f16_e32 v0.l, v0.l %sin = call half @llvm.amdgcn.sin.f16(half %src) #0 ret half %sin } define amdgpu_ps half @s_sin_f16(half inreg %src) #1 { -; GCN-LABEL: s_sin_f16: -; GCN: ; %bb.0: -; GCN: v_sin_f16_e32 v0, s0 +; FAKE16-LABEL: s_sin_f16: +; FAKE16: ; %bb.0: +; FAKE16: v_sin_f16_e32 v0, s0 +; +; REAL16-LABEL: s_sin_f16: +; REAL16: ; %bb.0: +; REAL16: v_sin_f16_e32 v0.l, s0 %sin = call half @llvm.amdgcn.sin.f16(half %src) #0 ret half %sin } @@ -50,17 +59,25 @@ define amdgpu_ps float @s_cos_f32(float inreg %src) #1 { } define amdgpu_ps half @v_cos_f16(half %src) #1 { -; GCN-LABEL: v_cos_f16: -; GCN: ; %bb.0: -; GCN: v_cos_f16_e32 v0, v0 +; FAKE16-LABEL: v_cos_f16: +; FAKE16: ; %bb.0: +; FAKE16: v_cos_f16_e32 v0, v0 +; +; REAL16-LABEL: v_cos_f16: +; REAL16: ; %bb.0: +; REAL16: v_cos_f16_e32 v0.l, v0.l %cos = call half @llvm.amdgcn.cos.f16(half %src) #0 ret half %cos } define amdgpu_ps half @s_cos_f16(half inreg %src) #1 { -; GCN-LABEL: s_cos_f16: -; GCN: ; %bb.0: -; GCN: v_cos_f16_e32 v0, s0 +; FAKE16-LABEL: s_cos_f16: +; FAKE16: ; %bb.0: +; FAKE16: v_cos_f16_e32 v0, s0 +; +; REAL16-LABEL: s_cos_f16: +; REAL16: ; %bb.0: +; REAL16: v_cos_f16_e32 v0.l, s0 %cos = call half @llvm.amdgcn.cos.f16(half %src) #0 ret half %cos } diff --git a/llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll b/llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll index 54871a622189b..fe9bad1de22d4 100644 --- a/llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll +++ b/llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN:llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefix=GFX1250 %s +; RUN:llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck --check-prefixes=GFX1250,FAKE16 %s +; RUN:llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck --check-prefixes=GFX1250,REAL16 %s define float @global_system_atomic_fadd_f32(ptr addrspace(1) %ptr, float %val) { ; GFX1250-LABEL: global_system_atomic_fadd_f32: @@ -338,173 +339,325 @@ define i64 @global_system_atomic_umax_i64(ptr addrspace(1) %ptr, i64 %val) { } define i16 @global_one_as_atomic_min_i16(ptr addrspace(1) %ptr, i16 %val) { -; GFX1250-LABEL: global_one_as_atomic_min_i16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v3, v0 -; GFX1250-NEXT: s_mov_b32 s0, 0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX1250-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX1250-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX1250-NEXT: global_load_b32 v5, v[0:1], off -; GFX1250-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_not_b32_e32 v4, v4 -; GFX1250-NEXT: .LBB28_1: ; %atomicrmw.start -; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v7, v5 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX1250-NEXT: v_min_i16 v5, v5, v2 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 -; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1250-NEXT: s_cbranch_execnz .LBB28_1 -; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; FAKE16-LABEL: global_one_as_atomic_min_i16: +; FAKE16: ; %bb.0: +; FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; FAKE16-NEXT: s_wait_kmcnt 0x0 +; FAKE16-NEXT: v_mov_b32_e32 v3, v0 +; FAKE16-NEXT: s_mov_b32 s0, 0 +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; FAKE16-NEXT: v_not_b32_e32 v4, v4 +; FAKE16-NEXT: .LBB28_1: ; %atomicrmw.start +; FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; FAKE16-NEXT: s_wait_loadcnt 0x0 +; FAKE16-NEXT: v_mov_b32_e32 v7, v5 +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; FAKE16-NEXT: v_min_i16 v5, v5, v2 +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; FAKE16-NEXT: v_and_or_b32 v6, v7, v4, v5 +; FAKE16-NEXT: s_wait_xcnt 0x0 +; FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; FAKE16-NEXT: s_wait_loadcnt 0x0 +; FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 +; FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; FAKE16-NEXT: s_wait_xcnt 0x0 +; FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; FAKE16-NEXT: s_cbranch_execnz .LBB28_1 +; FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; REAL16-LABEL: global_one_as_atomic_min_i16: +; REAL16: ; %bb.0: +; REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; REAL16-NEXT: s_wait_kmcnt 0x0 +; REAL16-NEXT: v_mov_b32_e32 v3, v0 +; REAL16-NEXT: s_mov_b32 s0, 0 +; REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; REAL16-NEXT: v_and_b32_e32 v0, -4, v3 +; REAL16-NEXT: v_and_b32_e32 v3, 3, v3 +; REAL16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; REAL16-NEXT: global_load_b32 v5, v[0:1], off +; REAL16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; REAL16-NEXT: v_not_b32_e32 v4, v4 +; REAL16-NEXT: .LBB28_1: ; %atomicrmw.start +; REAL16-NEXT: ; =>This Inner Loop Header: Depth=1 +; REAL16-NEXT: s_wait_loadcnt 0x0 +; REAL16-NEXT: v_mov_b32_e32 v7, v5 +; REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; REAL16-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; REAL16-NEXT: v_mov_b16_e32 v5.h, 0 +; REAL16-NEXT: v_min_i16 v5.l, v5.l, v2.l +; REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; REAL16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; REAL16-NEXT: v_and_or_b32 v6, v7, v4, v5 +; REAL16-NEXT: s_wait_xcnt 0x0 +; REAL16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; REAL16-NEXT: s_wait_loadcnt 0x0 +; REAL16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 +; REAL16-NEXT: s_or_b32 s0, vcc_lo, s0 +; REAL16-NEXT: s_wait_xcnt 0x0 +; REAL16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; REAL16-NEXT: s_cbranch_execnz .LBB28_1 +; REAL16-NEXT: ; %bb.2: ; %atomicrmw.end +; REAL16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; REAL16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; REAL16-NEXT: s_set_pc_i64 s[30:31] %result = atomicrmw min ptr addrspace(1) %ptr, i16 %val syncscope("one-as") monotonic ret i16 %result } define i16 @global_one_as_atomic_umin_i16(ptr addrspace(1) %ptr, i16 %val) { -; GFX1250-LABEL: global_one_as_atomic_umin_i16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v3, v0 -; GFX1250-NEXT: s_mov_b32 s0, 0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX1250-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX1250-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX1250-NEXT: global_load_b32 v5, v[0:1], off -; GFX1250-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_not_b32_e32 v4, v4 -; GFX1250-NEXT: .LBB29_1: ; %atomicrmw.start -; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v7, v5 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX1250-NEXT: v_min_u16 v5, v5, v2 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 -; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1250-NEXT: s_cbranch_execnz .LBB29_1 -; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; FAKE16-LABEL: global_one_as_atomic_umin_i16: +; FAKE16: ; %bb.0: +; FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; FAKE16-NEXT: s_wait_kmcnt 0x0 +; FAKE16-NEXT: v_mov_b32_e32 v3, v0 +; FAKE16-NEXT: s_mov_b32 s0, 0 +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; FAKE16-NEXT: v_not_b32_e32 v4, v4 +; FAKE16-NEXT: .LBB29_1: ; %atomicrmw.start +; FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; FAKE16-NEXT: s_wait_loadcnt 0x0 +; FAKE16-NEXT: v_mov_b32_e32 v7, v5 +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; FAKE16-NEXT: v_min_u16 v5, v5, v2 +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; FAKE16-NEXT: v_and_or_b32 v6, v7, v4, v5 +; FAKE16-NEXT: s_wait_xcnt 0x0 +; FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; FAKE16-NEXT: s_wait_loadcnt 0x0 +; FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 +; FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; FAKE16-NEXT: s_wait_xcnt 0x0 +; FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; FAKE16-NEXT: s_cbranch_execnz .LBB29_1 +; FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; REAL16-LABEL: global_one_as_atomic_umin_i16: +; REAL16: ; %bb.0: +; REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; REAL16-NEXT: s_wait_kmcnt 0x0 +; REAL16-NEXT: v_mov_b32_e32 v3, v0 +; REAL16-NEXT: s_mov_b32 s0, 0 +; REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; REAL16-NEXT: v_and_b32_e32 v0, -4, v3 +; REAL16-NEXT: v_and_b32_e32 v3, 3, v3 +; REAL16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; REAL16-NEXT: global_load_b32 v5, v[0:1], off +; REAL16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; REAL16-NEXT: v_not_b32_e32 v4, v4 +; REAL16-NEXT: .LBB29_1: ; %atomicrmw.start +; REAL16-NEXT: ; =>This Inner Loop Header: Depth=1 +; REAL16-NEXT: s_wait_loadcnt 0x0 +; REAL16-NEXT: v_mov_b32_e32 v7, v5 +; REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; REAL16-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; REAL16-NEXT: v_mov_b16_e32 v5.h, 0 +; REAL16-NEXT: v_min_u16 v5.l, v5.l, v2.l +; REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; REAL16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; REAL16-NEXT: v_and_or_b32 v6, v7, v4, v5 +; REAL16-NEXT: s_wait_xcnt 0x0 +; REAL16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; REAL16-NEXT: s_wait_loadcnt 0x0 +; REAL16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 +; REAL16-NEXT: s_or_b32 s0, vcc_lo, s0 +; REAL16-NEXT: s_wait_xcnt 0x0 +; REAL16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; REAL16-NEXT: s_cbranch_execnz .LBB29_1 +; REAL16-NEXT: ; %bb.2: ; %atomicrmw.end +; REAL16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; REAL16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; REAL16-NEXT: s_set_pc_i64 s[30:31] %result = atomicrmw umin ptr addrspace(1) %ptr, i16 %val syncscope("one-as") monotonic ret i16 %result } define i16 @global_one_as_atomic_max_i16(ptr addrspace(1) %ptr, i16 %val) { -; GFX1250-LABEL: global_one_as_atomic_max_i16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v3, v0 -; GFX1250-NEXT: s_mov_b32 s0, 0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX1250-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX1250-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX1250-NEXT: global_load_b32 v5, v[0:1], off -; GFX1250-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_not_b32_e32 v4, v4 -; GFX1250-NEXT: .LBB30_1: ; %atomicrmw.start -; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v7, v5 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX1250-NEXT: v_max_i16 v5, v5, v2 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 -; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1250-NEXT: s_cbranch_execnz .LBB30_1 -; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; FAKE16-LABEL: global_one_as_atomic_max_i16: +; FAKE16: ; %bb.0: +; FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; FAKE16-NEXT: s_wait_kmcnt 0x0 +; FAKE16-NEXT: v_mov_b32_e32 v3, v0 +; FAKE16-NEXT: s_mov_b32 s0, 0 +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; FAKE16-NEXT: v_not_b32_e32 v4, v4 +; FAKE16-NEXT: .LBB30_1: ; %atomicrmw.start +; FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; FAKE16-NEXT: s_wait_loadcnt 0x0 +; FAKE16-NEXT: v_mov_b32_e32 v7, v5 +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; FAKE16-NEXT: v_max_i16 v5, v5, v2 +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; FAKE16-NEXT: v_and_or_b32 v6, v7, v4, v5 +; FAKE16-NEXT: s_wait_xcnt 0x0 +; FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; FAKE16-NEXT: s_wait_loadcnt 0x0 +; FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 +; FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; FAKE16-NEXT: s_wait_xcnt 0x0 +; FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; FAKE16-NEXT: s_cbranch_execnz .LBB30_1 +; FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; REAL16-LABEL: global_one_as_atomic_max_i16: +; REAL16: ; %bb.0: +; REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; REAL16-NEXT: s_wait_kmcnt 0x0 +; REAL16-NEXT: v_mov_b32_e32 v3, v0 +; REAL16-NEXT: s_mov_b32 s0, 0 +; REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; REAL16-NEXT: v_and_b32_e32 v0, -4, v3 +; REAL16-NEXT: v_and_b32_e32 v3, 3, v3 +; REAL16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; REAL16-NEXT: global_load_b32 v5, v[0:1], off +; REAL16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; REAL16-NEXT: v_not_b32_e32 v4, v4 +; REAL16-NEXT: .LBB30_1: ; %atomicrmw.start +; REAL16-NEXT: ; =>This Inner Loop Header: Depth=1 +; REAL16-NEXT: s_wait_loadcnt 0x0 +; REAL16-NEXT: v_mov_b32_e32 v7, v5 +; REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; REAL16-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; REAL16-NEXT: v_mov_b16_e32 v5.h, 0 +; REAL16-NEXT: v_max_i16 v5.l, v5.l, v2.l +; REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; REAL16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; REAL16-NEXT: v_and_or_b32 v6, v7, v4, v5 +; REAL16-NEXT: s_wait_xcnt 0x0 +; REAL16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; REAL16-NEXT: s_wait_loadcnt 0x0 +; REAL16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 +; REAL16-NEXT: s_or_b32 s0, vcc_lo, s0 +; REAL16-NEXT: s_wait_xcnt 0x0 +; REAL16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; REAL16-NEXT: s_cbranch_execnz .LBB30_1 +; REAL16-NEXT: ; %bb.2: ; %atomicrmw.end +; REAL16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; REAL16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; REAL16-NEXT: s_set_pc_i64 s[30:31] %result = atomicrmw max ptr addrspace(1) %ptr, i16 %val syncscope("one-as") monotonic ret i16 %result } define i16 @global_one_as_atomic_umax_i16(ptr addrspace(1) %ptr, i16 %val) { -; GFX1250-LABEL: global_one_as_atomic_umax_i16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v3, v0 -; GFX1250-NEXT: s_mov_b32 s0, 0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX1250-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX1250-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX1250-NEXT: global_load_b32 v5, v[0:1], off -; GFX1250-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_not_b32_e32 v4, v4 -; GFX1250-NEXT: .LBB31_1: ; %atomicrmw.start -; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v7, v5 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX1250-NEXT: v_max_u16 v5, v5, v2 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 -; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1250-NEXT: s_cbranch_execnz .LBB31_1 -; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; FAKE16-LABEL: global_one_as_atomic_umax_i16: +; FAKE16: ; %bb.0: +; FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; FAKE16-NEXT: s_wait_kmcnt 0x0 +; FAKE16-NEXT: v_mov_b32_e32 v3, v0 +; FAKE16-NEXT: s_mov_b32 s0, 0 +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; FAKE16-NEXT: v_not_b32_e32 v4, v4 +; FAKE16-NEXT: .LBB31_1: ; %atomicrmw.start +; FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; FAKE16-NEXT: s_wait_loadcnt 0x0 +; FAKE16-NEXT: v_mov_b32_e32 v7, v5 +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; FAKE16-NEXT: v_max_u16 v5, v5, v2 +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; FAKE16-NEXT: v_and_or_b32 v6, v7, v4, v5 +; FAKE16-NEXT: s_wait_xcnt 0x0 +; FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; FAKE16-NEXT: s_wait_loadcnt 0x0 +; FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 +; FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; FAKE16-NEXT: s_wait_xcnt 0x0 +; FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; FAKE16-NEXT: s_cbranch_execnz .LBB31_1 +; FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; REAL16-LABEL: global_one_as_atomic_umax_i16: +; REAL16: ; %bb.0: +; REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; REAL16-NEXT: s_wait_kmcnt 0x0 +; REAL16-NEXT: v_mov_b32_e32 v3, v0 +; REAL16-NEXT: s_mov_b32 s0, 0 +; REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; REAL16-NEXT: v_and_b32_e32 v0, -4, v3 +; REAL16-NEXT: v_and_b32_e32 v3, 3, v3 +; REAL16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; REAL16-NEXT: global_load_b32 v5, v[0:1], off +; REAL16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; REAL16-NEXT: v_not_b32_e32 v4, v4 +; REAL16-NEXT: .LBB31_1: ; %atomicrmw.start +; REAL16-NEXT: ; =>This Inner Loop Header: Depth=1 +; REAL16-NEXT: s_wait_loadcnt 0x0 +; REAL16-NEXT: v_mov_b32_e32 v7, v5 +; REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; REAL16-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; REAL16-NEXT: v_mov_b16_e32 v5.h, 0 +; REAL16-NEXT: v_max_u16 v5.l, v5.l, v2.l +; REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; REAL16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; REAL16-NEXT: v_and_or_b32 v6, v7, v4, v5 +; REAL16-NEXT: s_wait_xcnt 0x0 +; REAL16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; REAL16-NEXT: s_wait_loadcnt 0x0 +; REAL16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 +; REAL16-NEXT: s_or_b32 s0, vcc_lo, s0 +; REAL16-NEXT: s_wait_xcnt 0x0 +; REAL16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; REAL16-NEXT: s_cbranch_execnz .LBB31_1 +; REAL16-NEXT: ; %bb.2: ; %atomicrmw.end +; REAL16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; REAL16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; REAL16-NEXT: s_set_pc_i64 s[30:31] %result = atomicrmw umax ptr addrspace(1) %ptr, i16 %val syncscope("one-as") monotonic ret i16 %result } @@ -1312,173 +1465,325 @@ define i64 @flat_system_atomic_umax_i64(ptr %ptr, i64 %val) { } define i16 @flat_one_as_atomic_min_i16(ptr %ptr, i16 %val) { -; GFX1250-LABEL: flat_one_as_atomic_min_i16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v3, v0 -; GFX1250-NEXT: s_mov_b32 s0, 0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX1250-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX1250-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX1250-NEXT: flat_load_b32 v5, v[0:1] -; GFX1250-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_not_b32_e32 v4, v4 -; GFX1250-NEXT: .LBB60_1: ; %atomicrmw.start -; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v7, v5 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX1250-NEXT: v_min_i16 v5, v5, v2 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[6:7] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 -; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1250-NEXT: s_cbranch_execnz .LBB60_1 -; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; FAKE16-LABEL: flat_one_as_atomic_min_i16: +; FAKE16: ; %bb.0: +; FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; FAKE16-NEXT: s_wait_kmcnt 0x0 +; FAKE16-NEXT: v_mov_b32_e32 v3, v0 +; FAKE16-NEXT: s_mov_b32 s0, 0 +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; FAKE16-NEXT: flat_load_b32 v5, v[0:1] +; FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; FAKE16-NEXT: v_not_b32_e32 v4, v4 +; FAKE16-NEXT: .LBB60_1: ; %atomicrmw.start +; FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; FAKE16-NEXT: v_mov_b32_e32 v7, v5 +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; FAKE16-NEXT: v_min_i16 v5, v5, v2 +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; FAKE16-NEXT: v_and_or_b32 v6, v7, v4, v5 +; FAKE16-NEXT: s_wait_xcnt 0x0 +; FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[6:7] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 +; FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; FAKE16-NEXT: s_wait_xcnt 0x0 +; FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; FAKE16-NEXT: s_cbranch_execnz .LBB60_1 +; FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; REAL16-LABEL: flat_one_as_atomic_min_i16: +; REAL16: ; %bb.0: +; REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; REAL16-NEXT: s_wait_kmcnt 0x0 +; REAL16-NEXT: v_mov_b32_e32 v3, v0 +; REAL16-NEXT: s_mov_b32 s0, 0 +; REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; REAL16-NEXT: v_and_b32_e32 v0, -4, v3 +; REAL16-NEXT: v_and_b32_e32 v3, 3, v3 +; REAL16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; REAL16-NEXT: flat_load_b32 v5, v[0:1] +; REAL16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; REAL16-NEXT: v_not_b32_e32 v4, v4 +; REAL16-NEXT: .LBB60_1: ; %atomicrmw.start +; REAL16-NEXT: ; =>This Inner Loop Header: Depth=1 +; REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; REAL16-NEXT: v_mov_b32_e32 v7, v5 +; REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; REAL16-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; REAL16-NEXT: v_mov_b16_e32 v5.h, 0 +; REAL16-NEXT: v_min_i16 v5.l, v5.l, v2.l +; REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; REAL16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; REAL16-NEXT: v_and_or_b32 v6, v7, v4, v5 +; REAL16-NEXT: s_wait_xcnt 0x0 +; REAL16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[6:7] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; REAL16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 +; REAL16-NEXT: s_or_b32 s0, vcc_lo, s0 +; REAL16-NEXT: s_wait_xcnt 0x0 +; REAL16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; REAL16-NEXT: s_cbranch_execnz .LBB60_1 +; REAL16-NEXT: ; %bb.2: ; %atomicrmw.end +; REAL16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; REAL16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; REAL16-NEXT: s_set_pc_i64 s[30:31] %result = atomicrmw min ptr %ptr, i16 %val syncscope("one-as") monotonic ret i16 %result } define i16 @flat_one_as_atomic_umin_i16(ptr %ptr, i16 %val) { -; GFX1250-LABEL: flat_one_as_atomic_umin_i16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v3, v0 -; GFX1250-NEXT: s_mov_b32 s0, 0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX1250-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX1250-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX1250-NEXT: flat_load_b32 v5, v[0:1] -; GFX1250-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_not_b32_e32 v4, v4 -; GFX1250-NEXT: .LBB61_1: ; %atomicrmw.start -; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v7, v5 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX1250-NEXT: v_min_u16 v5, v5, v2 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[6:7] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 -; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1250-NEXT: s_cbranch_execnz .LBB61_1 -; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; FAKE16-LABEL: flat_one_as_atomic_umin_i16: +; FAKE16: ; %bb.0: +; FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; FAKE16-NEXT: s_wait_kmcnt 0x0 +; FAKE16-NEXT: v_mov_b32_e32 v3, v0 +; FAKE16-NEXT: s_mov_b32 s0, 0 +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; FAKE16-NEXT: flat_load_b32 v5, v[0:1] +; FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; FAKE16-NEXT: v_not_b32_e32 v4, v4 +; FAKE16-NEXT: .LBB61_1: ; %atomicrmw.start +; FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; FAKE16-NEXT: v_mov_b32_e32 v7, v5 +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; FAKE16-NEXT: v_min_u16 v5, v5, v2 +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; FAKE16-NEXT: v_and_or_b32 v6, v7, v4, v5 +; FAKE16-NEXT: s_wait_xcnt 0x0 +; FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[6:7] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 +; FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; FAKE16-NEXT: s_wait_xcnt 0x0 +; FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; FAKE16-NEXT: s_cbranch_execnz .LBB61_1 +; FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; REAL16-LABEL: flat_one_as_atomic_umin_i16: +; REAL16: ; %bb.0: +; REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; REAL16-NEXT: s_wait_kmcnt 0x0 +; REAL16-NEXT: v_mov_b32_e32 v3, v0 +; REAL16-NEXT: s_mov_b32 s0, 0 +; REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; REAL16-NEXT: v_and_b32_e32 v0, -4, v3 +; REAL16-NEXT: v_and_b32_e32 v3, 3, v3 +; REAL16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; REAL16-NEXT: flat_load_b32 v5, v[0:1] +; REAL16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; REAL16-NEXT: v_not_b32_e32 v4, v4 +; REAL16-NEXT: .LBB61_1: ; %atomicrmw.start +; REAL16-NEXT: ; =>This Inner Loop Header: Depth=1 +; REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; REAL16-NEXT: v_mov_b32_e32 v7, v5 +; REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; REAL16-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; REAL16-NEXT: v_mov_b16_e32 v5.h, 0 +; REAL16-NEXT: v_min_u16 v5.l, v5.l, v2.l +; REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; REAL16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; REAL16-NEXT: v_and_or_b32 v6, v7, v4, v5 +; REAL16-NEXT: s_wait_xcnt 0x0 +; REAL16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[6:7] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; REAL16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 +; REAL16-NEXT: s_or_b32 s0, vcc_lo, s0 +; REAL16-NEXT: s_wait_xcnt 0x0 +; REAL16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; REAL16-NEXT: s_cbranch_execnz .LBB61_1 +; REAL16-NEXT: ; %bb.2: ; %atomicrmw.end +; REAL16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; REAL16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; REAL16-NEXT: s_set_pc_i64 s[30:31] %result = atomicrmw umin ptr %ptr, i16 %val syncscope("one-as") monotonic ret i16 %result } define i16 @flat_one_as_atomic_max_i16(ptr %ptr, i16 %val) { -; GFX1250-LABEL: flat_one_as_atomic_max_i16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v3, v0 -; GFX1250-NEXT: s_mov_b32 s0, 0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX1250-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX1250-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX1250-NEXT: flat_load_b32 v5, v[0:1] -; GFX1250-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_not_b32_e32 v4, v4 -; GFX1250-NEXT: .LBB62_1: ; %atomicrmw.start -; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v7, v5 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX1250-NEXT: v_max_i16 v5, v5, v2 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[6:7] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 -; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1250-NEXT: s_cbranch_execnz .LBB62_1 -; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; FAKE16-LABEL: flat_one_as_atomic_max_i16: +; FAKE16: ; %bb.0: +; FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; FAKE16-NEXT: s_wait_kmcnt 0x0 +; FAKE16-NEXT: v_mov_b32_e32 v3, v0 +; FAKE16-NEXT: s_mov_b32 s0, 0 +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; FAKE16-NEXT: flat_load_b32 v5, v[0:1] +; FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; FAKE16-NEXT: v_not_b32_e32 v4, v4 +; FAKE16-NEXT: .LBB62_1: ; %atomicrmw.start +; FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; FAKE16-NEXT: v_mov_b32_e32 v7, v5 +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; FAKE16-NEXT: v_max_i16 v5, v5, v2 +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; FAKE16-NEXT: v_and_or_b32 v6, v7, v4, v5 +; FAKE16-NEXT: s_wait_xcnt 0x0 +; FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[6:7] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 +; FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; FAKE16-NEXT: s_wait_xcnt 0x0 +; FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; FAKE16-NEXT: s_cbranch_execnz .LBB62_1 +; FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; REAL16-LABEL: flat_one_as_atomic_max_i16: +; REAL16: ; %bb.0: +; REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; REAL16-NEXT: s_wait_kmcnt 0x0 +; REAL16-NEXT: v_mov_b32_e32 v3, v0 +; REAL16-NEXT: s_mov_b32 s0, 0 +; REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; REAL16-NEXT: v_and_b32_e32 v0, -4, v3 +; REAL16-NEXT: v_and_b32_e32 v3, 3, v3 +; REAL16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; REAL16-NEXT: flat_load_b32 v5, v[0:1] +; REAL16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; REAL16-NEXT: v_not_b32_e32 v4, v4 +; REAL16-NEXT: .LBB62_1: ; %atomicrmw.start +; REAL16-NEXT: ; =>This Inner Loop Header: Depth=1 +; REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; REAL16-NEXT: v_mov_b32_e32 v7, v5 +; REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; REAL16-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; REAL16-NEXT: v_mov_b16_e32 v5.h, 0 +; REAL16-NEXT: v_max_i16 v5.l, v5.l, v2.l +; REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; REAL16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; REAL16-NEXT: v_and_or_b32 v6, v7, v4, v5 +; REAL16-NEXT: s_wait_xcnt 0x0 +; REAL16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[6:7] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; REAL16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 +; REAL16-NEXT: s_or_b32 s0, vcc_lo, s0 +; REAL16-NEXT: s_wait_xcnt 0x0 +; REAL16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; REAL16-NEXT: s_cbranch_execnz .LBB62_1 +; REAL16-NEXT: ; %bb.2: ; %atomicrmw.end +; REAL16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; REAL16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; REAL16-NEXT: s_set_pc_i64 s[30:31] %result = atomicrmw max ptr %ptr, i16 %val syncscope("one-as") monotonic ret i16 %result } define i16 @flat_one_as_atomic_umax_i16(ptr %ptr, i16 %val) { -; GFX1250-LABEL: flat_one_as_atomic_umax_i16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v3, v0 -; GFX1250-NEXT: s_mov_b32 s0, 0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX1250-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX1250-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX1250-NEXT: flat_load_b32 v5, v[0:1] -; GFX1250-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_not_b32_e32 v4, v4 -; GFX1250-NEXT: .LBB63_1: ; %atomicrmw.start -; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v7, v5 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX1250-NEXT: v_max_u16 v5, v5, v2 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[6:7] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 -; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX1250-NEXT: s_cbranch_execnz .LBB63_1 -; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-NEXT: v_lshrrev_b32_e32 v0, v3, v5 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; FAKE16-LABEL: flat_one_as_atomic_umax_i16: +; FAKE16: ; %bb.0: +; FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; FAKE16-NEXT: s_wait_kmcnt 0x0 +; FAKE16-NEXT: v_mov_b32_e32 v3, v0 +; FAKE16-NEXT: s_mov_b32 s0, 0 +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 +; FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 +; FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; FAKE16-NEXT: flat_load_b32 v5, v[0:1] +; FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; FAKE16-NEXT: v_not_b32_e32 v4, v4 +; FAKE16-NEXT: .LBB63_1: ; %atomicrmw.start +; FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; FAKE16-NEXT: v_mov_b32_e32 v7, v5 +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; FAKE16-NEXT: v_max_u16 v5, v5, v2 +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; FAKE16-NEXT: v_and_or_b32 v6, v7, v4, v5 +; FAKE16-NEXT: s_wait_xcnt 0x0 +; FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[6:7] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 +; FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; FAKE16-NEXT: s_wait_xcnt 0x0 +; FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; FAKE16-NEXT: s_cbranch_execnz .LBB63_1 +; FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; REAL16-LABEL: flat_one_as_atomic_umax_i16: +; REAL16: ; %bb.0: +; REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; REAL16-NEXT: s_wait_kmcnt 0x0 +; REAL16-NEXT: v_mov_b32_e32 v3, v0 +; REAL16-NEXT: s_mov_b32 s0, 0 +; REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; REAL16-NEXT: v_and_b32_e32 v0, -4, v3 +; REAL16-NEXT: v_and_b32_e32 v3, 3, v3 +; REAL16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; REAL16-NEXT: flat_load_b32 v5, v[0:1] +; REAL16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; REAL16-NEXT: v_not_b32_e32 v4, v4 +; REAL16-NEXT: .LBB63_1: ; %atomicrmw.start +; REAL16-NEXT: ; =>This Inner Loop Header: Depth=1 +; REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; REAL16-NEXT: v_mov_b32_e32 v7, v5 +; REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; REAL16-NEXT: v_lshrrev_b32_e32 v5, v3, v7 +; REAL16-NEXT: v_mov_b16_e32 v5.h, 0 +; REAL16-NEXT: v_max_u16 v5.l, v5.l, v2.l +; REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; REAL16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; REAL16-NEXT: v_and_or_b32 v6, v7, v4, v5 +; REAL16-NEXT: s_wait_xcnt 0x0 +; REAL16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[6:7] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; REAL16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 +; REAL16-NEXT: s_or_b32 s0, vcc_lo, s0 +; REAL16-NEXT: s_wait_xcnt 0x0 +; REAL16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; REAL16-NEXT: s_cbranch_execnz .LBB63_1 +; REAL16-NEXT: ; %bb.2: ; %atomicrmw.end +; REAL16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; REAL16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; REAL16-NEXT: s_set_pc_i64 s[30:31] %result = atomicrmw umax ptr %ptr, i16 %val syncscope("one-as") monotonic ret i16 %result } diff --git a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll index dab095786b5e9..e1da8a7a52a51 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll @@ -1,7 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GCN,GFX-942 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck --check-prefixes=GCN,GFX-950 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefix=GFX1250 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck --check-prefixes=GFX1250,GFX1250-FAKE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck --check-prefixes=GFX1250,GFX1250-REAL16 %s ; TODO: Add global-isel when it can support bf16 @@ -11,11 +12,20 @@ define amdgpu_ps float @v_test_cvt_bf16_f32_v(bfloat %v) { ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: ; return to shader part epilog ; -; GFX1250-LABEL: v_test_cvt_bf16_f32_v: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX1250-NEXT: ; return to shader part epilog +; GFX1250-FAKE16-LABEL: v_test_cvt_bf16_f32_v: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-FAKE16-NEXT: ; return to shader part epilog +; +; GFX1250-REAL16-LABEL: v_test_cvt_bf16_f32_v: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-REAL16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX1250-REAL16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX1250-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-REAL16-NEXT: v_mov_b32_e32 v0, v1 +; GFX1250-REAL16-NEXT: ; return to shader part epilog %cvt = fpext bfloat %v to float ret float %cvt } @@ -131,13 +141,22 @@ define amdgpu_ps float @v_test_cvt_f32_bf16_v(float %src) { ; GFX-950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX-950-NEXT: ; return to shader part epilog ; -; GFX1250-LABEL: v_test_cvt_f32_bf16_v: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX1250-NEXT: ; return to shader part epilog +; GFX1250-FAKE16-LABEL: v_test_cvt_f32_bf16_v: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-FAKE16-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-FAKE16-NEXT: ; return to shader part epilog +; +; GFX1250-REAL16-LABEL: v_test_cvt_f32_bf16_v: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-REAL16-NEXT: v_cvt_pk_bf16_f32 v1, v0, s0 +; GFX1250-REAL16-NEXT: v_mov_b16_e32 v0.l, 0 +; GFX1250-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-REAL16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX1250-REAL16-NEXT: ; return to shader part epilog %trunc = fptrunc float %src to bfloat %ext = fpext bfloat %trunc to float ret float %ext diff --git a/llvm/test/CodeGen/AMDGPU/code-size-estimate-gfx1250.ll b/llvm/test/CodeGen/AMDGPU/code-size-estimate-gfx1250.ll index 75af0b5f15306..5f82bec69a6b3 100644 --- a/llvm/test/CodeGen/AMDGPU/code-size-estimate-gfx1250.ll +++ b/llvm/test/CodeGen/AMDGPU/code-size-estimate-gfx1250.ll @@ -1,13 +1,21 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -show-mc-encoding < %s | FileCheck -check-prefixes=GFX1250 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -show-mc-encoding -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX1250,FAKE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -show-mc-encoding -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX1250,REAL16 %s define i16 @cvt_pk_bf8_f16_v(ptr addrspace(1) %out) { -; GFX1250-LABEL: cvt_pk_bf8_f16_v: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; encoding: [0x00,0x00,0xc8,0xbf] -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf] -; GFX1250-NEXT: v_cvt_pk_bf8_f16 v0, 0x38003800 ; encoding: [0x00,0x00,0x73,0xd7,0xff,0x00,0x01,0x02,0x00,0x38,0x00,0x38] -; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] +; FAKE16-LABEL: cvt_pk_bf8_f16_v: +; FAKE16: ; %bb.0: +; FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; encoding: [0x00,0x00,0xc8,0xbf] +; FAKE16-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf] +; FAKE16-NEXT: v_cvt_pk_bf8_f16 v0, 0x38003800 ; encoding: [0x00,0x00,0x73,0xd7,0xff,0x00,0x01,0x02,0x00,0x38,0x00,0x38] +; FAKE16-NEXT: s_set_pc_i64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] +; +; REAL16-LABEL: cvt_pk_bf8_f16_v: +; REAL16: ; %bb.0: +; REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 ; encoding: [0x00,0x00,0xc8,0xbf] +; REAL16-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf] +; REAL16-NEXT: v_cvt_pk_bf8_f16 v0.l, 0x38003800 ; encoding: [0x00,0x00,0x73,0xd7,0xff,0x00,0x01,0x02,0x00,0x38,0x00,0x38] +; REAL16-NEXT: s_set_pc_i64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] %cvt = tail call i16 @llvm.amdgcn.cvt.pk.bf8.f16(<2 x half> ) ret i16 %cvt } @@ -15,14 +23,23 @@ define i16 @cvt_pk_bf8_f16_v(ptr addrspace(1) %out) { ; GFX1250: codeLenInByte = 24 define i16 @cvt_pk_fp8_f16_v(ptr addrspace(1) %out) { -; GFX1250-LABEL: cvt_pk_fp8_f16_v: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; encoding: [0x00,0x00,0xc8,0xbf] -; GFX1250-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf] -; GFX1250-NEXT: v_cvt_pk_fp8_f16 v0, 0x3800 ; encoding: [0x00,0x00,0x72,0xd7,0xff,0x00,0x01,0x02,0x00,0x38,0x00,0x00] -; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] +; FAKE16-LABEL: cvt_pk_fp8_f16_v: +; FAKE16: ; %bb.0: +; FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; encoding: [0x00,0x00,0xc8,0xbf] +; FAKE16-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf] +; FAKE16-NEXT: v_cvt_pk_fp8_f16 v0, 0x3800 ; encoding: [0x00,0x00,0x72,0xd7,0xff,0x00,0x01,0x02,0x00,0x38,0x00,0x00] +; FAKE16-NEXT: s_set_pc_i64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] +; +; REAL16-LABEL: cvt_pk_fp8_f16_v: +; REAL16: ; %bb.0: +; REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 ; encoding: [0x00,0x00,0xc8,0xbf] +; REAL16-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf] +; REAL16-NEXT: v_cvt_pk_fp8_f16 v0.l, 0x3800 ; encoding: [0x00,0x00,0x72,0xd7,0xff,0x00,0x01,0x02,0x00,0x38,0x00,0x00] +; REAL16-NEXT: s_set_pc_i64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] %cvt = tail call i16 @llvm.amdgcn.cvt.pk.fp8.f16(<2 x half> ) ret i16 %cvt } ; GFX1250: codeLenInByte = 24 +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX1250: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.bf16.ll index 85e7038b38563..9c0a7a16f4e50 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.bf16.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 %s -o - | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=-real-true16 %s -o - | FileCheck -check-prefixes=GCN,FAKE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+real-true16 %s -o - | FileCheck -check-prefixes=GCN,REAL16 %s define float @test_canonicalize_amdgcn_tanh_f32(float %a) { ; GCN-LABEL: test_canonicalize_amdgcn_tanh_f32: @@ -14,30 +15,50 @@ define float @test_canonicalize_amdgcn_tanh_f32(float %a) { } define bfloat @test_canonicalize_amdgcn_tanh_bf16(bfloat %a) { -; GCN-LABEL: test_canonicalize_amdgcn_tanh_bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: v_tanh_bf16_e32 v0, v0 -; GCN-NEXT: v_nop -; GCN-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_max_num_f32_e32 v0, v0, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 -; GCN-NEXT: s_set_pc_i64 s[30:31] +; FAKE16-LABEL: test_canonicalize_amdgcn_tanh_bf16: +; FAKE16: ; %bb.0: +; FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; FAKE16-NEXT: s_wait_kmcnt 0x0 +; FAKE16-NEXT: v_tanh_bf16_e32 v0, v0 +; FAKE16-NEXT: v_nop +; FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; FAKE16-NEXT: v_max_num_f32_e32 v0, v0, v0 +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; FAKE16-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; REAL16-LABEL: test_canonicalize_amdgcn_tanh_bf16: +; REAL16: ; %bb.0: +; REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; REAL16-NEXT: s_wait_kmcnt 0x0 +; REAL16-NEXT: v_tanh_bf16_e32 v0.h, v0.l +; REAL16-NEXT: v_nop +; REAL16-NEXT: v_mov_b16_e32 v0.l, 0 +; REAL16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1) +; REAL16-NEXT: v_max_num_f32_e32 v0, v0, v0 +; REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; REAL16-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; REAL16-NEXT: s_set_pc_i64 s[30:31] %tanh = call bfloat @llvm.amdgcn.tanh.bf16(bfloat %a) %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat %tanh) ret bfloat %canonicalized } define half @test_canonicalize_amdgcn_tanh_f16(half %a) { -; GCN-LABEL: test_canonicalize_amdgcn_tanh_f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: v_tanh_f16_e32 v0, v0 -; GCN-NEXT: s_set_pc_i64 s[30:31] +; FAKE16-LABEL: test_canonicalize_amdgcn_tanh_f16: +; FAKE16: ; %bb.0: +; FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; FAKE16-NEXT: s_wait_kmcnt 0x0 +; FAKE16-NEXT: v_tanh_f16_e32 v0, v0 +; FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; REAL16-LABEL: test_canonicalize_amdgcn_tanh_f16: +; REAL16: ; %bb.0: +; REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; REAL16-NEXT: s_wait_kmcnt 0x0 +; REAL16-NEXT: v_tanh_f16_e32 v0.l, v0.l +; REAL16-NEXT: s_set_pc_i64 s[30:31] %tanh = call half @llvm.amdgcn.tanh.f16(half %a) %canonicalized = call half @llvm.canonicalize.f16(half %tanh) ret half %canonicalized diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.bf16.ll index 65e0757b00c06..5805552c0132c 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.bf16.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1250 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1250,FAKE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1250,REAL16 %s declare bfloat @llvm.fabs.bf16(bfloat) #0 declare bfloat @llvm.canonicalize.bf16(bfloat) #0 @@ -16,14 +17,24 @@ declare <64 x bfloat> @llvm.canonicalize.v64bf16(<64 x bfloat>) #0 declare i32 @llvm.amdgcn.workitem.id.x() #0 define amdgpu_kernel void @test_fold_canonicalize_undef_value_bf16(ptr addrspace(1) %out) #1 { -; GFX1250-LABEL: test_fold_canonicalize_undef_value_bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_store_b16 v0, v0, s[0:1] -; GFX1250-NEXT: s_endpgm +; FAKE16-LABEL: test_fold_canonicalize_undef_value_bf16: +; FAKE16: ; %bb.0: +; FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; FAKE16-NEXT: s_wait_kmcnt 0x0 +; FAKE16-NEXT: global_store_b16 v0, v0, s[0:1] +; FAKE16-NEXT: s_endpgm +; +; REAL16-LABEL: test_fold_canonicalize_undef_value_bf16: +; REAL16: ; %bb.0: +; REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; REAL16-NEXT: v_mov_b32_e32 v1, 0 +; REAL16-NEXT: v_mov_b16_e32 v0.l, 0 +; REAL16-NEXT: s_wait_kmcnt 0x0 +; REAL16-NEXT: global_store_b16 v1, v0, s[0:1] +; REAL16-NEXT: s_endpgm %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat undef) store bfloat %canonicalized, ptr addrspace(1) %out ret void @@ -70,15 +81,29 @@ define amdgpu_kernel void @s_test_canonicalize_var_bf16(ptr addrspace(1) %out, i } define <2 x bfloat> @v_test_canonicalize_build_vector_v2bf16(bfloat %lo, bfloat %hi) #1 { -; GFX1250-LABEL: v_test_canonicalize_build_vector_v2bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v0, 16, v0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v0, v0, v0 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; FAKE16-LABEL: v_test_canonicalize_build_vector_v2bf16: +; FAKE16: ; %bb.0: +; FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; FAKE16-NEXT: s_wait_kmcnt 0x0 +; FAKE16-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v0, 16, v0 +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; FAKE16-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v0, v0, v0 +; FAKE16-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; REAL16-LABEL: v_test_canonicalize_build_vector_v2bf16: +; REAL16: ; %bb.0: +; REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; REAL16-NEXT: s_wait_kmcnt 0x0 +; REAL16-NEXT: v_mov_b16_e32 v2.l, 0 +; REAL16-NEXT: v_mov_b16_e32 v2.h, v1.l +; REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; REAL16-NEXT: v_max_num_f32_e32 v1, v2, v2 +; REAL16-NEXT: v_mov_b16_e32 v2.h, v0.l +; REAL16-NEXT: v_max_num_f32_e32 v0, v2, v2 +; REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; REAL16-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; REAL16-NEXT: s_set_pc_i64 s[30:31] %ins0 = insertelement <2 x bfloat> poison, bfloat %lo, i32 0 %ins1 = insertelement <2 x bfloat> %ins0, bfloat %hi, i32 1 %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> %ins1) @@ -87,22 +112,39 @@ define <2 x bfloat> @v_test_canonicalize_build_vector_v2bf16(bfloat %lo, bfloat define amdgpu_kernel void @v_test_canonicalize_fabs_var_bf16(ptr addrspace(1) %out) #1 { -; GFX1250-LABEL: v_test_canonicalize_fabs_var_bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_load_u16 v1, v0, s[0:1] -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_and_b32_e32 v1, 0x7fff, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX1250-NEXT: v_max_num_f32_e32 v1, v1, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 -; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; FAKE16-LABEL: v_test_canonicalize_fabs_var_bf16: +; FAKE16: ; %bb.0: +; FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; FAKE16-NEXT: s_wait_kmcnt 0x0 +; FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] +; FAKE16-NEXT: s_wait_loadcnt 0x0 +; FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; FAKE16-NEXT: v_max_num_f32_e32 v1, v1, v1 +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; FAKE16-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; FAKE16-NEXT: s_endpgm +; +; REAL16-LABEL: v_test_canonicalize_fabs_var_bf16: +; REAL16: ; %bb.0: +; REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; REAL16-NEXT: v_mov_b32_e32 v1, 0 +; REAL16-NEXT: s_wait_kmcnt 0x0 +; REAL16-NEXT: global_load_u16 v0, v1, s[0:1] +; REAL16-NEXT: s_wait_loadcnt 0x0 +; REAL16-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; REAL16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; REAL16-NEXT: v_max_num_f32_e32 v0, v0, v0 +; REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; REAL16-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; REAL16-NEXT: global_store_b16 v1, v0, s[0:1] +; REAL16-NEXT: s_endpgm %val = load bfloat, ptr addrspace(1) %out %val.fabs = call bfloat @llvm.fabs.bf16(bfloat %val) %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat %val.fabs) @@ -112,22 +154,39 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_bf16(ptr addrspace(1) %o define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_bf16(ptr addrspace(1) %out) #1 { -; GFX1250-LABEL: v_test_canonicalize_fneg_fabs_var_bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_load_u16 v1, v0, s[0:1] -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_or_b32_e32 v1, 0x8000, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX1250-NEXT: v_max_num_f32_e32 v1, v1, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 -; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; FAKE16-LABEL: v_test_canonicalize_fneg_fabs_var_bf16: +; FAKE16: ; %bb.0: +; FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; FAKE16-NEXT: s_wait_kmcnt 0x0 +; FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] +; FAKE16-NEXT: s_wait_loadcnt 0x0 +; FAKE16-NEXT: v_or_b32_e32 v1, 0x8000, v1 +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; FAKE16-NEXT: v_max_num_f32_e32 v1, v1, v1 +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; FAKE16-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; FAKE16-NEXT: s_endpgm +; +; REAL16-LABEL: v_test_canonicalize_fneg_fabs_var_bf16: +; REAL16: ; %bb.0: +; REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; REAL16-NEXT: v_mov_b32_e32 v1, 0 +; REAL16-NEXT: s_wait_kmcnt 0x0 +; REAL16-NEXT: global_load_u16 v0, v1, s[0:1] +; REAL16-NEXT: s_wait_loadcnt 0x0 +; REAL16-NEXT: v_or_b32_e32 v0, 0x8000, v0 +; REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; REAL16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; REAL16-NEXT: v_max_num_f32_e32 v0, v0, v0 +; REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; REAL16-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; REAL16-NEXT: global_store_b16 v1, v0, s[0:1] +; REAL16-NEXT: s_endpgm %val = load bfloat, ptr addrspace(1) %out %val.fabs = call bfloat @llvm.fabs.bf16(bfloat %val) %val.fabs.fneg = fneg bfloat %val.fabs @@ -137,22 +196,39 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_bf16(ptr addrspace( } define amdgpu_kernel void @v_test_canonicalize_fneg_var_bf16(ptr addrspace(1) %out) #1 { -; GFX1250-LABEL: v_test_canonicalize_fneg_var_bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_load_u16 v1, v0, s[0:1] -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_xor_b32_e32 v1, 0x8000, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX1250-NEXT: v_max_num_f32_e32 v1, v1, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 -; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; FAKE16-LABEL: v_test_canonicalize_fneg_var_bf16: +; FAKE16: ; %bb.0: +; FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; FAKE16-NEXT: s_wait_kmcnt 0x0 +; FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] +; FAKE16-NEXT: s_wait_loadcnt 0x0 +; FAKE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; FAKE16-NEXT: v_max_num_f32_e32 v1, v1, v1 +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; FAKE16-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; FAKE16-NEXT: s_endpgm +; +; REAL16-LABEL: v_test_canonicalize_fneg_var_bf16: +; REAL16: ; %bb.0: +; REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; REAL16-NEXT: v_mov_b32_e32 v1, 0 +; REAL16-NEXT: s_wait_kmcnt 0x0 +; REAL16-NEXT: global_load_u16 v0, v1, s[0:1] +; REAL16-NEXT: s_wait_loadcnt 0x0 +; REAL16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; REAL16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; REAL16-NEXT: v_max_num_f32_e32 v0, v0, v0 +; REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; REAL16-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; REAL16-NEXT: global_store_b16 v1, v0, s[0:1] +; REAL16-NEXT: s_endpgm %val = load bfloat, ptr addrspace(1) %out %val.fneg = fneg bfloat %val %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat %val.fneg) @@ -161,22 +237,39 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_bf16(ptr addrspace(1) %o } define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_bf16(ptr addrspace(1) %out) #2 { -; GFX1250-LABEL: v_test_no_denormals_canonicalize_fneg_var_bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_load_u16 v1, v0, s[0:1] -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_xor_b32_e32 v1, 0x8000, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX1250-NEXT: v_max_num_f32_e32 v1, v1, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 -; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; FAKE16-LABEL: v_test_no_denormals_canonicalize_fneg_var_bf16: +; FAKE16: ; %bb.0: +; FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; FAKE16-NEXT: s_wait_kmcnt 0x0 +; FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] +; FAKE16-NEXT: s_wait_loadcnt 0x0 +; FAKE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; FAKE16-NEXT: v_max_num_f32_e32 v1, v1, v1 +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; FAKE16-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; FAKE16-NEXT: s_endpgm +; +; REAL16-LABEL: v_test_no_denormals_canonicalize_fneg_var_bf16: +; REAL16: ; %bb.0: +; REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; REAL16-NEXT: v_mov_b32_e32 v1, 0 +; REAL16-NEXT: s_wait_kmcnt 0x0 +; REAL16-NEXT: global_load_u16 v0, v1, s[0:1] +; REAL16-NEXT: s_wait_loadcnt 0x0 +; REAL16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; REAL16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; REAL16-NEXT: v_max_num_f32_e32 v0, v0, v0 +; REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; REAL16-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; REAL16-NEXT: global_store_b16 v1, v0, s[0:1] +; REAL16-NEXT: s_endpgm %val = load bfloat, ptr addrspace(1) %out %val.fneg = fneg bfloat %val %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat %val.fneg) @@ -185,22 +278,39 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_bf16(ptr ad } define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_fabs_var_bf16(ptr addrspace(1) %out) #2 { -; GFX1250-LABEL: v_test_no_denormals_canonicalize_fneg_fabs_var_bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_load_u16 v1, v0, s[0:1] -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_or_b32_e32 v1, 0x8000, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX1250-NEXT: v_max_num_f32_e32 v1, v1, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 -; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; FAKE16-LABEL: v_test_no_denormals_canonicalize_fneg_fabs_var_bf16: +; FAKE16: ; %bb.0: +; FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; FAKE16-NEXT: s_wait_kmcnt 0x0 +; FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] +; FAKE16-NEXT: s_wait_loadcnt 0x0 +; FAKE16-NEXT: v_or_b32_e32 v1, 0x8000, v1 +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; FAKE16-NEXT: v_max_num_f32_e32 v1, v1, v1 +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; FAKE16-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; FAKE16-NEXT: s_endpgm +; +; REAL16-LABEL: v_test_no_denormals_canonicalize_fneg_fabs_var_bf16: +; REAL16: ; %bb.0: +; REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; REAL16-NEXT: v_mov_b32_e32 v1, 0 +; REAL16-NEXT: s_wait_kmcnt 0x0 +; REAL16-NEXT: global_load_u16 v0, v1, s[0:1] +; REAL16-NEXT: s_wait_loadcnt 0x0 +; REAL16-NEXT: v_or_b32_e32 v0, 0x8000, v0 +; REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; REAL16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; REAL16-NEXT: v_max_num_f32_e32 v0, v0, v0 +; REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; REAL16-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; REAL16-NEXT: global_store_b16 v1, v0, s[0:1] +; REAL16-NEXT: s_endpgm %val = load bfloat, ptr addrspace(1) %out %val.fabs = call bfloat @llvm.fabs.bf16(bfloat %val) %val.fabs.fneg = fneg bfloat %val.fabs @@ -210,224 +320,384 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_fabs_var_bf16(p } define amdgpu_kernel void @test_fold_canonicalize_p0_bf16(ptr addrspace(1) %out) #1 { -; GFX1250-LABEL: test_fold_canonicalize_p0_bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_store_b16 v0, v0, s[0:1] -; GFX1250-NEXT: s_endpgm +; FAKE16-LABEL: test_fold_canonicalize_p0_bf16: +; FAKE16: ; %bb.0: +; FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; FAKE16-NEXT: s_wait_kmcnt 0x0 +; FAKE16-NEXT: global_store_b16 v0, v0, s[0:1] +; FAKE16-NEXT: s_endpgm +; +; REAL16-LABEL: test_fold_canonicalize_p0_bf16: +; REAL16: ; %bb.0: +; REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; REAL16-NEXT: v_mov_b32_e32 v1, 0 +; REAL16-NEXT: v_mov_b16_e32 v0.l, 0 +; REAL16-NEXT: s_wait_kmcnt 0x0 +; REAL16-NEXT: global_store_b16 v1, v0, s[0:1] +; REAL16-NEXT: s_endpgm %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 0.0) store bfloat %canonicalized, ptr addrspace(1) %out ret void } define amdgpu_kernel void @test_fold_canonicalize_n0_bf16(ptr addrspace(1) %out) #1 { -; GFX1250-LABEL: test_fold_canonicalize_n0_bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv -; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff8000 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; FAKE16-LABEL: test_fold_canonicalize_n0_bf16: +; FAKE16: ; %bb.0: +; FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff8000 +; FAKE16-NEXT: s_wait_kmcnt 0x0 +; FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; FAKE16-NEXT: s_endpgm +; +; REAL16-LABEL: test_fold_canonicalize_n0_bf16: +; REAL16: ; %bb.0: +; REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; REAL16-NEXT: v_mov_b32_e32 v1, 0 +; REAL16-NEXT: v_mov_b16_e32 v0.l, 0x8000 +; REAL16-NEXT: s_wait_kmcnt 0x0 +; REAL16-NEXT: global_store_b16 v1, v0, s[0:1] +; REAL16-NEXT: s_endpgm %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat -0.0) store bfloat %canonicalized, ptr addrspace(1) %out ret void } define amdgpu_kernel void @test_fold_canonicalize_p1_bf16(ptr addrspace(1) %out) #1 { -; GFX1250-LABEL: test_fold_canonicalize_p1_bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv -; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3f80 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; FAKE16-LABEL: test_fold_canonicalize_p1_bf16: +; FAKE16: ; %bb.0: +; FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3f80 +; FAKE16-NEXT: s_wait_kmcnt 0x0 +; FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; FAKE16-NEXT: s_endpgm +; +; REAL16-LABEL: test_fold_canonicalize_p1_bf16: +; REAL16: ; %bb.0: +; REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; REAL16-NEXT: v_mov_b32_e32 v1, 0 +; REAL16-NEXT: v_mov_b16_e32 v0.l, 0x3f80 +; REAL16-NEXT: s_wait_kmcnt 0x0 +; REAL16-NEXT: global_store_b16 v1, v0, s[0:1] +; REAL16-NEXT: s_endpgm %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 1.0) store bfloat %canonicalized, ptr addrspace(1) %out ret void } define amdgpu_kernel void @test_fold_canonicalize_n1_bf16(ptr addrspace(1) %out) #1 { -; GFX1250-LABEL: test_fold_canonicalize_n1_bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv -; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffffbf80 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; FAKE16-LABEL: test_fold_canonicalize_n1_bf16: +; FAKE16: ; %bb.0: +; FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffffbf80 +; FAKE16-NEXT: s_wait_kmcnt 0x0 +; FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; FAKE16-NEXT: s_endpgm +; +; REAL16-LABEL: test_fold_canonicalize_n1_bf16: +; REAL16: ; %bb.0: +; REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; REAL16-NEXT: v_mov_b32_e32 v1, 0 +; REAL16-NEXT: v_mov_b16_e32 v0.l, 0xbf80 +; REAL16-NEXT: s_wait_kmcnt 0x0 +; REAL16-NEXT: global_store_b16 v1, v0, s[0:1] +; REAL16-NEXT: s_endpgm %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat -1.0) store bfloat %canonicalized, ptr addrspace(1) %out ret void } define amdgpu_kernel void @test_fold_canonicalize_literal_bf16(ptr addrspace(1) %out) #1 { -; GFX1250-LABEL: test_fold_canonicalize_literal_bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv -; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x4180 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; FAKE16-LABEL: test_fold_canonicalize_literal_bf16: +; FAKE16: ; %bb.0: +; FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x4180 +; FAKE16-NEXT: s_wait_kmcnt 0x0 +; FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; FAKE16-NEXT: s_endpgm +; +; REAL16-LABEL: test_fold_canonicalize_literal_bf16: +; REAL16: ; %bb.0: +; REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; REAL16-NEXT: v_mov_b32_e32 v1, 0 +; REAL16-NEXT: v_mov_b16_e32 v0.l, 0x4180 +; REAL16-NEXT: s_wait_kmcnt 0x0 +; REAL16-NEXT: global_store_b16 v1, v0, s[0:1] +; REAL16-NEXT: s_endpgm %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 16.0) store bfloat %canonicalized, ptr addrspace(1) %out ret void } define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal0_bf16(ptr addrspace(1) %out) #1 { -; GFX1250-LABEL: test_default_denormals_fold_canonicalize_denormal0_bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv -; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; FAKE16-LABEL: test_default_denormals_fold_canonicalize_denormal0_bf16: +; FAKE16: ; %bb.0: +; FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff +; FAKE16-NEXT: s_wait_kmcnt 0x0 +; FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; FAKE16-NEXT: s_endpgm +; +; REAL16-LABEL: test_default_denormals_fold_canonicalize_denormal0_bf16: +; REAL16: ; %bb.0: +; REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; REAL16-NEXT: v_mov_b32_e32 v1, 0 +; REAL16-NEXT: v_mov_b16_e32 v0.l, 0x3ff +; REAL16-NEXT: s_wait_kmcnt 0x0 +; REAL16-NEXT: global_store_b16 v1, v0, s[0:1] +; REAL16-NEXT: s_endpgm %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 0xR03FF) store bfloat %canonicalized, ptr addrspace(1) %out ret void } define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_bf16(ptr addrspace(1) %out) #3 { -; GFX1250-LABEL: test_denormals_fold_canonicalize_denormal0_bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv -; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; FAKE16-LABEL: test_denormals_fold_canonicalize_denormal0_bf16: +; FAKE16: ; %bb.0: +; FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff +; FAKE16-NEXT: s_wait_kmcnt 0x0 +; FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; FAKE16-NEXT: s_endpgm +; +; REAL16-LABEL: test_denormals_fold_canonicalize_denormal0_bf16: +; REAL16: ; %bb.0: +; REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; REAL16-NEXT: v_mov_b32_e32 v1, 0 +; REAL16-NEXT: v_mov_b16_e32 v0.l, 0x3ff +; REAL16-NEXT: s_wait_kmcnt 0x0 +; REAL16-NEXT: global_store_b16 v1, v0, s[0:1] +; REAL16-NEXT: s_endpgm %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 0xR03FF) store bfloat %canonicalized, ptr addrspace(1) %out ret void } define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal1_bf16(ptr addrspace(1) %out) #1 { -; GFX1250-LABEL: test_default_denormals_fold_canonicalize_denormal1_bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv -; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff83ff -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; FAKE16-LABEL: test_default_denormals_fold_canonicalize_denormal1_bf16: +; FAKE16: ; %bb.0: +; FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff83ff +; FAKE16-NEXT: s_wait_kmcnt 0x0 +; FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; FAKE16-NEXT: s_endpgm +; +; REAL16-LABEL: test_default_denormals_fold_canonicalize_denormal1_bf16: +; REAL16: ; %bb.0: +; REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; REAL16-NEXT: v_mov_b32_e32 v1, 0 +; REAL16-NEXT: v_mov_b16_e32 v0.l, 0x83ff +; REAL16-NEXT: s_wait_kmcnt 0x0 +; REAL16-NEXT: global_store_b16 v1, v0, s[0:1] +; REAL16-NEXT: s_endpgm %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 0xR83FF) store bfloat %canonicalized, ptr addrspace(1) %out ret void } define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_bf16(ptr addrspace(1) %out) #3 { -; GFX1250-LABEL: test_denormals_fold_canonicalize_denormal1_bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv -; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff83ff -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; FAKE16-LABEL: test_denormals_fold_canonicalize_denormal1_bf16: +; FAKE16: ; %bb.0: +; FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff83ff +; FAKE16-NEXT: s_wait_kmcnt 0x0 +; FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; FAKE16-NEXT: s_endpgm +; +; REAL16-LABEL: test_denormals_fold_canonicalize_denormal1_bf16: +; REAL16: ; %bb.0: +; REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; REAL16-NEXT: v_mov_b32_e32 v1, 0 +; REAL16-NEXT: v_mov_b16_e32 v0.l, 0x83ff +; REAL16-NEXT: s_wait_kmcnt 0x0 +; REAL16-NEXT: global_store_b16 v1, v0, s[0:1] +; REAL16-NEXT: s_endpgm %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 0xR83FF) store bfloat %canonicalized, ptr addrspace(1) %out ret void } define amdgpu_kernel void @test_fold_canonicalize_qnan_bf16(ptr addrspace(1) %out) #1 { -; GFX1250-LABEL: test_fold_canonicalize_qnan_bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv -; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7c00 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; FAKE16-LABEL: test_fold_canonicalize_qnan_bf16: +; FAKE16: ; %bb.0: +; FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7c00 +; FAKE16-NEXT: s_wait_kmcnt 0x0 +; FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; FAKE16-NEXT: s_endpgm +; +; REAL16-LABEL: test_fold_canonicalize_qnan_bf16: +; REAL16: ; %bb.0: +; REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; REAL16-NEXT: v_mov_b32_e32 v1, 0 +; REAL16-NEXT: v_mov_b16_e32 v0.l, 0x7c00 +; REAL16-NEXT: s_wait_kmcnt 0x0 +; REAL16-NEXT: global_store_b16 v1, v0, s[0:1] +; REAL16-NEXT: s_endpgm %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 0xR7C00) store bfloat %canonicalized, ptr addrspace(1) %out ret void } define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_bf16(ptr addrspace(1) %out) #1 { -; GFX1250-LABEL: test_fold_canonicalize_qnan_value_neg1_bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv -; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; FAKE16-LABEL: test_fold_canonicalize_qnan_value_neg1_bf16: +; FAKE16: ; %bb.0: +; FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc0 +; FAKE16-NEXT: s_wait_kmcnt 0x0 +; FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; FAKE16-NEXT: s_endpgm +; +; REAL16-LABEL: test_fold_canonicalize_qnan_value_neg1_bf16: +; REAL16: ; %bb.0: +; REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; REAL16-NEXT: v_mov_b32_e32 v1, 0 +; REAL16-NEXT: v_mov_b16_e32 v0.l, 0x7fc0 +; REAL16-NEXT: s_wait_kmcnt 0x0 +; REAL16-NEXT: global_store_b16 v1, v0, s[0:1] +; REAL16-NEXT: s_endpgm %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat bitcast (i16 -1 to bfloat)) store bfloat %canonicalized, ptr addrspace(1) %out ret void } define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_bf16(ptr addrspace(1) %out) #1 { -; GFX1250-LABEL: test_fold_canonicalize_qnan_value_neg2_bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv -; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; FAKE16-LABEL: test_fold_canonicalize_qnan_value_neg2_bf16: +; FAKE16: ; %bb.0: +; FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc0 +; FAKE16-NEXT: s_wait_kmcnt 0x0 +; FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; FAKE16-NEXT: s_endpgm +; +; REAL16-LABEL: test_fold_canonicalize_qnan_value_neg2_bf16: +; REAL16: ; %bb.0: +; REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; REAL16-NEXT: v_mov_b32_e32 v1, 0 +; REAL16-NEXT: v_mov_b16_e32 v0.l, 0x7fc0 +; REAL16-NEXT: s_wait_kmcnt 0x0 +; REAL16-NEXT: global_store_b16 v1, v0, s[0:1] +; REAL16-NEXT: s_endpgm %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat bitcast (i16 -2 to bfloat)) store bfloat %canonicalized, ptr addrspace(1) %out ret void } define amdgpu_kernel void @test_fold_canonicalize_snan0_value_bf16(ptr addrspace(1) %out) #1 { -; GFX1250-LABEL: test_fold_canonicalize_snan0_value_bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv -; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7c01 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; FAKE16-LABEL: test_fold_canonicalize_snan0_value_bf16: +; FAKE16: ; %bb.0: +; FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7c01 +; FAKE16-NEXT: s_wait_kmcnt 0x0 +; FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; FAKE16-NEXT: s_endpgm +; +; REAL16-LABEL: test_fold_canonicalize_snan0_value_bf16: +; REAL16: ; %bb.0: +; REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; REAL16-NEXT: v_mov_b32_e32 v1, 0 +; REAL16-NEXT: v_mov_b16_e32 v0.l, 0x7c01 +; REAL16-NEXT: s_wait_kmcnt 0x0 +; REAL16-NEXT: global_store_b16 v1, v0, s[0:1] +; REAL16-NEXT: s_endpgm %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 0xR7C01) store bfloat %canonicalized, ptr addrspace(1) %out ret void } define amdgpu_kernel void @test_fold_canonicalize_snan1_value_bf16(ptr addrspace(1) %out) #1 { -; GFX1250-LABEL: test_fold_canonicalize_snan1_value_bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv -; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7dff -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; FAKE16-LABEL: test_fold_canonicalize_snan1_value_bf16: +; FAKE16: ; %bb.0: +; FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7dff +; FAKE16-NEXT: s_wait_kmcnt 0x0 +; FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; FAKE16-NEXT: s_endpgm +; +; REAL16-LABEL: test_fold_canonicalize_snan1_value_bf16: +; REAL16: ; %bb.0: +; REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; REAL16-NEXT: v_mov_b32_e32 v1, 0 +; REAL16-NEXT: v_mov_b16_e32 v0.l, 0x7dff +; REAL16-NEXT: s_wait_kmcnt 0x0 +; REAL16-NEXT: global_store_b16 v1, v0, s[0:1] +; REAL16-NEXT: s_endpgm %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 0xR7DFF) store bfloat %canonicalized, ptr addrspace(1) %out ret void } define amdgpu_kernel void @test_fold_canonicalize_snan2_value_bf16(ptr addrspace(1) %out) #1 { -; GFX1250-LABEL: test_fold_canonicalize_snan2_value_bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv -; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xfffffdff -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; FAKE16-LABEL: test_fold_canonicalize_snan2_value_bf16: +; FAKE16: ; %bb.0: +; FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xfffffdff +; FAKE16-NEXT: s_wait_kmcnt 0x0 +; FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; FAKE16-NEXT: s_endpgm +; +; REAL16-LABEL: test_fold_canonicalize_snan2_value_bf16: +; REAL16: ; %bb.0: +; REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; REAL16-NEXT: v_mov_b32_e32 v1, 0 +; REAL16-NEXT: v_mov_b16_e32 v0.l, 0xfdff +; REAL16-NEXT: s_wait_kmcnt 0x0 +; REAL16-NEXT: global_store_b16 v1, v0, s[0:1] +; REAL16-NEXT: s_endpgm %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 0xRFDFF) store bfloat %canonicalized, ptr addrspace(1) %out ret void } define amdgpu_kernel void @test_fold_canonicalize_snan3_value_bf16(ptr addrspace(1) %out) #1 { -; GFX1250-LABEL: test_fold_canonicalize_snan3_value_bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv -; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xfffffc01 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; FAKE16-LABEL: test_fold_canonicalize_snan3_value_bf16: +; FAKE16: ; %bb.0: +; FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xfffffc01 +; FAKE16-NEXT: s_wait_kmcnt 0x0 +; FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; FAKE16-NEXT: s_endpgm +; +; REAL16-LABEL: test_fold_canonicalize_snan3_value_bf16: +; REAL16: ; %bb.0: +; REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; REAL16-NEXT: v_mov_b32_e32 v1, 0 +; REAL16-NEXT: v_mov_b16_e32 v0.l, 0xfc01 +; REAL16-NEXT: s_wait_kmcnt 0x0 +; REAL16-NEXT: global_store_b16 v1, v0, s[0:1] +; REAL16-NEXT: s_endpgm %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 0xRFC01) store bfloat %canonicalized, ptr addrspace(1) %out ret void @@ -460,27 +730,46 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2bf16(ptr addrspace(1) %out) } define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2bf16(ptr addrspace(1) %out) #1 { -; GFX1250-LABEL: v_test_canonicalize_fabs_var_v2bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv -; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1250-NEXT: v_mov_b32_e32 v2, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX1250-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX1250-NEXT: v_and_b32_e32 v1, 0x7fff, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_lshlrev_b32 v1, 16, v1 -; GFX1250-NEXT: v_max_num_f32_e32 v1, v1, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 -; GFX1250-NEXT: global_store_b32 v2, v0, s[0:1] -; GFX1250-NEXT: s_endpgm +; FAKE16-LABEL: v_test_canonicalize_fabs_var_v2bf16: +; FAKE16: ; %bb.0: +; FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; FAKE16-NEXT: v_mov_b32_e32 v2, 0 +; FAKE16-NEXT: s_wait_kmcnt 0x0 +; FAKE16-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset +; FAKE16-NEXT: s_wait_loadcnt 0x0 +; FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; FAKE16-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_lshlrev_b32 v1, 16, v1 +; FAKE16-NEXT: v_max_num_f32_e32 v1, v1, v1 +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; FAKE16-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; FAKE16-NEXT: global_store_b32 v2, v0, s[0:1] +; FAKE16-NEXT: s_endpgm +; +; REAL16-LABEL: v_test_canonicalize_fabs_var_v2bf16: +; REAL16: ; %bb.0: +; REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; REAL16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; REAL16-NEXT: v_mov_b16_e32 v1.l, 0 +; REAL16-NEXT: s_wait_kmcnt 0x0 +; REAL16-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset +; REAL16-NEXT: s_wait_loadcnt 0x0 +; REAL16-NEXT: v_and_b16 v1.h, 0x7fff, v0.l +; REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; REAL16-NEXT: v_max_num_f32_e32 v2, v1, v1 +; REAL16-NEXT: v_and_b16 v1.h, 0x7fff, v0.h +; REAL16-NEXT: v_dual_max_num_f32 v0, v1, v1 :: v_dual_mov_b32 v1, 0 +; REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; REAL16-NEXT: v_cvt_pk_bf16_f32 v0, v2, v0 +; REAL16-NEXT: global_store_b32 v1, v0, s[0:1] +; REAL16-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %out, i32 %tid %val = load <2 x bfloat>, ptr addrspace(1) %gep @@ -491,27 +780,46 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2bf16(ptr addrspace(1) } define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2bf16(ptr addrspace(1) %out) #1 { -; GFX1250-LABEL: v_test_canonicalize_fneg_fabs_var_v2bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv -; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1250-NEXT: v_mov_b32_e32 v2, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX1250-NEXT: v_or_b32_e32 v0, 0x8000, v0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX1250-NEXT: v_or_b32_e32 v1, 0x8000, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_lshlrev_b32 v1, 16, v1 -; GFX1250-NEXT: v_max_num_f32_e32 v1, v1, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 -; GFX1250-NEXT: global_store_b32 v2, v0, s[0:1] -; GFX1250-NEXT: s_endpgm +; FAKE16-LABEL: v_test_canonicalize_fneg_fabs_var_v2bf16: +; FAKE16: ; %bb.0: +; FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; FAKE16-NEXT: v_mov_b32_e32 v2, 0 +; FAKE16-NEXT: s_wait_kmcnt 0x0 +; FAKE16-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset +; FAKE16-NEXT: s_wait_loadcnt 0x0 +; FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; FAKE16-NEXT: v_or_b32_e32 v0, 0x8000, v0 +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; FAKE16-NEXT: v_or_b32_e32 v1, 0x8000, v1 +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; FAKE16-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_lshlrev_b32 v1, 16, v1 +; FAKE16-NEXT: v_max_num_f32_e32 v1, v1, v1 +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; FAKE16-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; FAKE16-NEXT: global_store_b32 v2, v0, s[0:1] +; FAKE16-NEXT: s_endpgm +; +; REAL16-LABEL: v_test_canonicalize_fneg_fabs_var_v2bf16: +; REAL16: ; %bb.0: +; REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; REAL16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; REAL16-NEXT: v_mov_b16_e32 v1.l, 0 +; REAL16-NEXT: s_wait_kmcnt 0x0 +; REAL16-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset +; REAL16-NEXT: s_wait_loadcnt 0x0 +; REAL16-NEXT: v_or_b16 v1.h, 0x8000, v0.l +; REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; REAL16-NEXT: v_max_num_f32_e32 v2, v1, v1 +; REAL16-NEXT: v_or_b16 v1.h, 0x8000, v0.h +; REAL16-NEXT: v_dual_max_num_f32 v0, v1, v1 :: v_dual_mov_b32 v1, 0 +; REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; REAL16-NEXT: v_cvt_pk_bf16_f32 v0, v2, v0 +; REAL16-NEXT: global_store_b32 v1, v0, s[0:1] +; REAL16-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %out, i32 %tid %val = load <2 x bfloat>, ptr addrspace(1) %gep @@ -523,27 +831,46 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2bf16(ptr addrspac } define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2bf16(ptr addrspace(1) %out) #1 { -; GFX1250-LABEL: v_test_canonicalize_fneg_var_v2bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv -; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1250-NEXT: v_mov_b32_e32 v2, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX1250-NEXT: v_xor_b32_e32 v0, 0x8000, v0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX1250-NEXT: v_xor_b32_e32 v1, 0x8000, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_lshlrev_b32 v1, 16, v1 -; GFX1250-NEXT: v_max_num_f32_e32 v1, v1, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 -; GFX1250-NEXT: global_store_b32 v2, v0, s[0:1] -; GFX1250-NEXT: s_endpgm +; FAKE16-LABEL: v_test_canonicalize_fneg_var_v2bf16: +; FAKE16: ; %bb.0: +; FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; FAKE16-NEXT: v_mov_b32_e32 v2, 0 +; FAKE16-NEXT: s_wait_kmcnt 0x0 +; FAKE16-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset +; FAKE16-NEXT: s_wait_loadcnt 0x0 +; FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; FAKE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; FAKE16-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_lshlrev_b32 v1, 16, v1 +; FAKE16-NEXT: v_max_num_f32_e32 v1, v1, v1 +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; FAKE16-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; FAKE16-NEXT: global_store_b32 v2, v0, s[0:1] +; FAKE16-NEXT: s_endpgm +; +; REAL16-LABEL: v_test_canonicalize_fneg_var_v2bf16: +; REAL16: ; %bb.0: +; REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; REAL16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; REAL16-NEXT: v_mov_b16_e32 v1.l, 0 +; REAL16-NEXT: s_wait_kmcnt 0x0 +; REAL16-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset +; REAL16-NEXT: s_wait_loadcnt 0x0 +; REAL16-NEXT: v_xor_b16 v1.h, 0x8000, v0.l +; REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; REAL16-NEXT: v_max_num_f32_e32 v2, v1, v1 +; REAL16-NEXT: v_xor_b16 v1.h, 0x8000, v0.h +; REAL16-NEXT: v_dual_max_num_f32 v0, v1, v1 :: v_dual_mov_b32 v1, 0 +; REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; REAL16-NEXT: v_cvt_pk_bf16_f32 v0, v2, v0 +; REAL16-NEXT: global_store_b32 v1, v0, s[0:1] +; REAL16-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %out, i32 %tid %val = load <2 x bfloat>, ptr addrspace(1) %gep @@ -799,19 +1126,34 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_v2bf16(ptr addrspa } define <3 x bfloat> @v_test_canonicalize_var_v3bf16(<3 x bfloat> %val) #1 { -; GFX1250-LABEL: v_test_canonicalize_var_v3bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_dual_max_num_f32 v2, v2, v2 :: v_dual_max_num_f32 v0, v0, v0 -; GFX1250-NEXT: v_max_num_f32_e32 v1, v1, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v2 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; FAKE16-LABEL: v_test_canonicalize_var_v3bf16: +; FAKE16: ; %bb.0: +; FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; FAKE16-NEXT: s_wait_kmcnt 0x0 +; FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; FAKE16-NEXT: v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1 +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; FAKE16-NEXT: v_dual_max_num_f32 v2, v2, v2 :: v_dual_max_num_f32 v0, v0, v0 +; FAKE16-NEXT: v_max_num_f32_e32 v1, v1, v1 +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; FAKE16-NEXT: v_cvt_pk_bf16_f32 v0, v0, v2 +; FAKE16-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; REAL16-LABEL: v_test_canonicalize_var_v3bf16: +; REAL16: ; %bb.0: +; REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; REAL16-NEXT: s_wait_kmcnt 0x0 +; REAL16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; REAL16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; REAL16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; REAL16-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v2, v2, v2 +; REAL16-NEXT: v_max_num_f32_e32 v0, v0, v0 +; REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; REAL16-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; REAL16-NEXT: v_cvt_pk_bf16_f32 v0, v0, v2 +; REAL16-NEXT: s_set_pc_i64 s[30:31] %canonicalized = call <3 x bfloat> @llvm.canonicalize.v3bf16(<3 x bfloat> %val) ret <3 x bfloat> %canonicalized } @@ -850,36 +1192,64 @@ define amdgpu_kernel void @s_test_canonicalize_undef_v2bf16(ptr addrspace(1) %ou } define <2 x bfloat> @v_test_canonicalize_reg_undef_v2bf16(bfloat %val) #1 { -; GFX1250-LABEL: v_test_canonicalize_reg_undef_v2bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_max_num_f32_e32 v0, v0, v0 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 -; GFX1250-NEXT: s_movk_i32 s0, 0x7fc0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-NEXT: v_perm_b32 v0, s0, v0, 0x5040100 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; FAKE16-LABEL: v_test_canonicalize_reg_undef_v2bf16: +; FAKE16: ; %bb.0: +; FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; FAKE16-NEXT: s_wait_kmcnt 0x0 +; FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; FAKE16-NEXT: v_max_num_f32_e32 v0, v0, v0 +; FAKE16-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; FAKE16-NEXT: s_movk_i32 s0, 0x7fc0 +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; FAKE16-NEXT: v_perm_b32 v0, s0, v0, 0x5040100 +; FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; REAL16-LABEL: v_test_canonicalize_reg_undef_v2bf16: +; REAL16: ; %bb.0: +; REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; REAL16-NEXT: s_wait_kmcnt 0x0 +; REAL16-NEXT: v_mov_b16_e32 v1.l, 0 +; REAL16-NEXT: v_mov_b16_e32 v1.h, v0.l +; REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; REAL16-NEXT: v_max_num_f32_e32 v0, v1, v1 +; REAL16-NEXT: v_cvt_pk_bf16_f32 v1, v0, s0 +; REAL16-NEXT: v_mov_b16_e32 v0.h, 0x7fc0 +; REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; REAL16-NEXT: v_mov_b16_e32 v0.l, v1.l +; REAL16-NEXT: s_set_pc_i64 s[30:31] %vec = insertelement <2 x bfloat> poison, bfloat %val, i32 0 %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> %vec) ret <2 x bfloat> %canonicalized } define <2 x bfloat> @v_test_canonicalize_undef_reg_v2bf16(bfloat %val) #1 { -; GFX1250-LABEL: v_test_canonicalize_undef_reg_v2bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_max_num_f32_e32 v0, v0, v0 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 -; GFX1250-NEXT: s_movk_i32 s0, 0x7fc0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-NEXT: v_perm_b32 v0, v0, s0, 0x5040100 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; FAKE16-LABEL: v_test_canonicalize_undef_reg_v2bf16: +; FAKE16: ; %bb.0: +; FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; FAKE16-NEXT: s_wait_kmcnt 0x0 +; FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; FAKE16-NEXT: v_max_num_f32_e32 v0, v0, v0 +; FAKE16-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; FAKE16-NEXT: s_movk_i32 s0, 0x7fc0 +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; FAKE16-NEXT: v_perm_b32 v0, v0, s0, 0x5040100 +; FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; REAL16-LABEL: v_test_canonicalize_undef_reg_v2bf16: +; REAL16: ; %bb.0: +; REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; REAL16-NEXT: s_wait_kmcnt 0x0 +; REAL16-NEXT: v_mov_b16_e32 v1.l, 0 +; REAL16-NEXT: v_mov_b16_e32 v1.h, v0.l +; REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; REAL16-NEXT: v_max_num_f32_e32 v0, v1, v1 +; REAL16-NEXT: v_cvt_pk_bf16_f32 v1, v0, s0 +; REAL16-NEXT: v_mov_b16_e32 v0.l, 0x7fc0 +; REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; REAL16-NEXT: v_mov_b16_e32 v0.h, v1.l +; REAL16-NEXT: s_set_pc_i64 s[30:31] %vec = insertelement <2 x bfloat> poison, bfloat %val, i32 1 %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> %vec) ret <2 x bfloat> %canonicalized @@ -934,18 +1304,32 @@ define <2 x bfloat> @v_test_canonicalize_k_lo_undef_hi_v2bf16() #1 { } define <2 x bfloat> @v_test_canonicalize_reg_k_v2bf16(bfloat %val) #1 { -; GFX1250-LABEL: v_test_canonicalize_reg_k_v2bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_max_num_f32_e32 v0, v0, v0 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 -; GFX1250-NEXT: s_movk_i32 s0, 0x4000 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-NEXT: v_perm_b32 v0, s0, v0, 0x5040100 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; FAKE16-LABEL: v_test_canonicalize_reg_k_v2bf16: +; FAKE16: ; %bb.0: +; FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; FAKE16-NEXT: s_wait_kmcnt 0x0 +; FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; FAKE16-NEXT: v_max_num_f32_e32 v0, v0, v0 +; FAKE16-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; FAKE16-NEXT: s_movk_i32 s0, 0x4000 +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; FAKE16-NEXT: v_perm_b32 v0, s0, v0, 0x5040100 +; FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; REAL16-LABEL: v_test_canonicalize_reg_k_v2bf16: +; REAL16: ; %bb.0: +; REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; REAL16-NEXT: s_wait_kmcnt 0x0 +; REAL16-NEXT: v_mov_b16_e32 v1.l, 0 +; REAL16-NEXT: v_mov_b16_e32 v1.h, v0.l +; REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; REAL16-NEXT: v_max_num_f32_e32 v0, v1, v1 +; REAL16-NEXT: v_cvt_pk_bf16_f32 v1, v0, s0 +; REAL16-NEXT: v_mov_b16_e32 v0.h, 0x4000 +; REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; REAL16-NEXT: v_mov_b16_e32 v0.l, v1.l +; REAL16-NEXT: s_set_pc_i64 s[30:31] %vec0 = insertelement <2 x bfloat> poison, bfloat %val, i32 0 %vec1 = insertelement <2 x bfloat> %vec0, bfloat 2.0, i32 1 %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> %vec1) @@ -953,18 +1337,32 @@ define <2 x bfloat> @v_test_canonicalize_reg_k_v2bf16(bfloat %val) #1 { } define <2 x bfloat> @v_test_canonicalize_k_reg_v2bf16(bfloat %val) #1 { -; GFX1250-LABEL: v_test_canonicalize_k_reg_v2bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_max_num_f32_e32 v0, v0, v0 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 -; GFX1250-NEXT: s_movk_i32 s0, 0x4000 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-NEXT: v_perm_b32 v0, v0, s0, 0x5040100 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; FAKE16-LABEL: v_test_canonicalize_k_reg_v2bf16: +; FAKE16: ; %bb.0: +; FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; FAKE16-NEXT: s_wait_kmcnt 0x0 +; FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; FAKE16-NEXT: v_max_num_f32_e32 v0, v0, v0 +; FAKE16-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; FAKE16-NEXT: s_movk_i32 s0, 0x4000 +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; FAKE16-NEXT: v_perm_b32 v0, v0, s0, 0x5040100 +; FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; REAL16-LABEL: v_test_canonicalize_k_reg_v2bf16: +; REAL16: ; %bb.0: +; REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; REAL16-NEXT: s_wait_kmcnt 0x0 +; REAL16-NEXT: v_mov_b16_e32 v1.l, 0 +; REAL16-NEXT: v_mov_b16_e32 v1.h, v0.l +; REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; REAL16-NEXT: v_max_num_f32_e32 v0, v1, v1 +; REAL16-NEXT: v_cvt_pk_bf16_f32 v1, v0, s0 +; REAL16-NEXT: v_mov_b16_e32 v0.l, 0x4000 +; REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; REAL16-NEXT: v_mov_b16_e32 v0.h, v1.l +; REAL16-NEXT: s_set_pc_i64 s[30:31] %vec0 = insertelement <2 x bfloat> poison, bfloat 2.0, i32 0 %vec1 = insertelement <2 x bfloat> %vec0, bfloat %val, i32 1 %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> %vec1) @@ -988,34 +1386,64 @@ define amdgpu_kernel void @s_test_canonicalize_undef_v4bf16(ptr addrspace(1) %ou } define <4 x bfloat> @v_test_canonicalize_reg_undef_undef_undef_v4bf16(bfloat %val) #1 { -; GFX1250-LABEL: v_test_canonicalize_reg_undef_undef_undef_v4bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v1, 0x7fc07fc0 :: v_dual_lshlrev_b32 v0, 16, v0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_max_num_f32_e32 v0, v0, v0 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 -; GFX1250-NEXT: s_movk_i32 s0, 0x7fc0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-NEXT: v_perm_b32 v0, s0, v0, 0x5040100 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; FAKE16-LABEL: v_test_canonicalize_reg_undef_undef_undef_v4bf16: +; FAKE16: ; %bb.0: +; FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; FAKE16-NEXT: s_wait_kmcnt 0x0 +; FAKE16-NEXT: v_dual_mov_b32 v1, 0x7fc07fc0 :: v_dual_lshlrev_b32 v0, 16, v0 +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; FAKE16-NEXT: v_max_num_f32_e32 v0, v0, v0 +; FAKE16-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; FAKE16-NEXT: s_movk_i32 s0, 0x7fc0 +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; FAKE16-NEXT: v_perm_b32 v0, s0, v0, 0x5040100 +; FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; REAL16-LABEL: v_test_canonicalize_reg_undef_undef_undef_v4bf16: +; REAL16: ; %bb.0: +; REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; REAL16-NEXT: s_wait_kmcnt 0x0 +; REAL16-NEXT: v_mov_b16_e32 v1.l, 0 +; REAL16-NEXT: v_mov_b16_e32 v1.h, v0.l +; REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; REAL16-NEXT: v_max_num_f32_e32 v0, v1, v1 +; REAL16-NEXT: v_cvt_pk_bf16_f32 v1, v0, s0 +; REAL16-NEXT: v_mov_b16_e32 v0.h, 0x7fc0 +; REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; REAL16-NEXT: v_mov_b16_e32 v0.l, v1.l +; REAL16-NEXT: v_mov_b32_e32 v1, 0x7fc07fc0 +; REAL16-NEXT: s_set_pc_i64 s[30:31] %vec = insertelement <4 x bfloat> poison, bfloat %val, i32 0 %canonicalized = call <4 x bfloat> @llvm.canonicalize.v4bf16(<4 x bfloat> %vec) ret <4 x bfloat> %canonicalized } define <4 x bfloat> @v_test_canonicalize_reg_reg_undef_undef_v4bf16(bfloat %val0, bfloat %val1) #1 { -; GFX1250-LABEL: v_test_canonicalize_reg_reg_undef_undef_v4bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v0, 16, v0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v0, v0, v0 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 -; GFX1250-NEXT: v_mov_b32_e32 v1, 0x7fc07fc0 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; FAKE16-LABEL: v_test_canonicalize_reg_reg_undef_undef_v4bf16: +; FAKE16: ; %bb.0: +; FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; FAKE16-NEXT: s_wait_kmcnt 0x0 +; FAKE16-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v0, 16, v0 +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; FAKE16-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v0, v0, v0 +; FAKE16-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; FAKE16-NEXT: v_mov_b32_e32 v1, 0x7fc07fc0 +; FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; REAL16-LABEL: v_test_canonicalize_reg_reg_undef_undef_v4bf16: +; REAL16: ; %bb.0: +; REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; REAL16-NEXT: s_wait_kmcnt 0x0 +; REAL16-NEXT: v_mov_b16_e32 v2.l, 0 +; REAL16-NEXT: v_mov_b16_e32 v2.h, v1.l +; REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; REAL16-NEXT: v_max_num_f32_e32 v1, v2, v2 +; REAL16-NEXT: v_mov_b16_e32 v2.h, v0.l +; REAL16-NEXT: v_max_num_f32_e32 v0, v2, v2 +; REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; REAL16-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; REAL16-NEXT: v_mov_b32_e32 v1, 0x7fc07fc0 +; REAL16-NEXT: s_set_pc_i64 s[30:31] %vec0 = insertelement <4 x bfloat> poison, bfloat %val0, i32 0 %vec1 = insertelement <4 x bfloat> %vec0, bfloat %val1, i32 1 %canonicalized = call <4 x bfloat> @llvm.canonicalize.v4bf16(<4 x bfloat> %vec1) @@ -1023,21 +1451,41 @@ define <4 x bfloat> @v_test_canonicalize_reg_reg_undef_undef_v4bf16(bfloat %val0 } define <4 x bfloat> @v_test_canonicalize_reg_undef_reg_reg_v4bf16(bfloat %val0, bfloat %val1, bfloat %val2) #1 { -; GFX1250-LABEL: v_test_canonicalize_reg_undef_reg_reg_v4bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v2, 16, v2 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_lshlrev_b32 v1, 16, v1 -; GFX1250-NEXT: v_dual_max_num_f32 v2, v2, v2 :: v_dual_max_num_f32 v1, v1, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 -; GFX1250-NEXT: s_movk_i32 s0, 0x7fc0 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, v2 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1250-NEXT: v_perm_b32 v0, s0, v0, 0x5040100 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; FAKE16-LABEL: v_test_canonicalize_reg_undef_reg_reg_v4bf16: +; FAKE16: ; %bb.0: +; FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; FAKE16-NEXT: s_wait_kmcnt 0x0 +; FAKE16-NEXT: v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; FAKE16-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_lshlrev_b32 v1, 16, v1 +; FAKE16-NEXT: v_dual_max_num_f32 v2, v2, v2 :: v_dual_max_num_f32 v1, v1, v1 +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; FAKE16-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; FAKE16-NEXT: s_movk_i32 s0, 0x7fc0 +; FAKE16-NEXT: v_cvt_pk_bf16_f32 v1, v1, v2 +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; FAKE16-NEXT: v_perm_b32 v0, s0, v0, 0x5040100 +; FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; REAL16-LABEL: v_test_canonicalize_reg_undef_reg_reg_v4bf16: +; REAL16: ; %bb.0: +; REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; REAL16-NEXT: s_wait_kmcnt 0x0 +; REAL16-NEXT: v_mov_b16_e32 v3.l, 0 +; REAL16-NEXT: v_mov_b16_e32 v3.h, v0.l +; REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; REAL16-NEXT: v_max_num_f32_e32 v0, v3, v3 +; REAL16-NEXT: v_mov_b16_e32 v3.h, v2.l +; REAL16-NEXT: v_max_num_f32_e32 v2, v3, v3 +; REAL16-NEXT: v_mov_b16_e32 v3.h, v1.l +; REAL16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; REAL16-NEXT: v_cvt_pk_bf16_f32 v1, v0, s0 +; REAL16-NEXT: v_mov_b16_e32 v0.h, 0x7fc0 +; REAL16-NEXT: v_max_num_f32_e32 v3, v3, v3 +; REAL16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; REAL16-NEXT: v_mov_b16_e32 v0.l, v1.l +; REAL16-NEXT: v_cvt_pk_bf16_f32 v1, v3, v2 +; REAL16-NEXT: s_set_pc_i64 s[30:31] %vec0 = insertelement <4 x bfloat> poison, bfloat %val0, i32 0 %vec1 = insertelement <4 x bfloat> %vec0, bfloat %val1, i32 2 %vec2 = insertelement <4 x bfloat> %vec1, bfloat %val2, i32 3 diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll index c8fe71958dd16..448a3bcb038c8 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG,GFX1250-SDAG-FAKE16 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL,GFX1250-GISEL-FAKE16 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG,GFX1250-SDAG-TRUE16 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL,GFX1250-GISEL-TRUE16 %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 -mattr=+real-true16,-sramecc < %s | FileCheck -check-prefixes=GFX1250,GFX1250-NOECC,GFX1250-NOECC-SDAG-TRUE16 %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 -mattr=-real-true16,-sramecc < %s | FileCheck -check-prefixes=GFX1250,GFX1250-NOECC,GFX1250-NOECC-SDAG-FAKE16 %s @@ -2058,21 +2060,39 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_undef_hi_immneg128(ptr in } define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_zero_hi(ptr inreg %sbase, i32 %voffset) { -; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16lo_zero_hi: -; GFX1250-SDAG: ; %bb.0: -; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-SDAG-NEXT: flat_load_u16 v0, v0, s[2:3] -; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX1250-SDAG-NEXT: ; return to shader part epilog -; -; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16lo_zero_hi: -; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-GISEL-NEXT: flat_load_u16 v0, v0, s[2:3] -; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX1250-GISEL-NEXT: ; return to shader part epilog +; GFX1250-SDAG-FAKE16-LABEL: flat_load_saddr_i16_d16lo_zero_hi: +; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-SDAG-FAKE16-NEXT: flat_load_u16 v0, v0, s[2:3] +; GFX1250-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1250-SDAG-FAKE16-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-FAKE16-LABEL: flat_load_saddr_i16_d16lo_zero_hi: +; GFX1250-GISEL-FAKE16: ; %bb.0: +; GFX1250-GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-GISEL-FAKE16-NEXT: flat_load_u16 v0, v0, s[2:3] +; GFX1250-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1250-GISEL-FAKE16-NEXT: ; return to shader part epilog +; +; GFX1250-SDAG-TRUE16-LABEL: flat_load_saddr_i16_d16lo_zero_hi: +; GFX1250-SDAG-TRUE16: ; %bb.0: +; GFX1250-SDAG-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-SDAG-TRUE16-NEXT: flat_load_u16 v0, v0, s[2:3] +; GFX1250-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 +; GFX1250-SDAG-TRUE16-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-TRUE16-LABEL: flat_load_saddr_i16_d16lo_zero_hi: +; GFX1250-GISEL-TRUE16: ; %bb.0: +; GFX1250-GISEL-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-GISEL-TRUE16-NEXT: flat_load_u16 v0, v0, s[2:3] +; GFX1250-GISEL-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-TRUE16-NEXT: v_lshl_or_b32 v0, 0, 16, v0 +; GFX1250-GISEL-TRUE16-NEXT: ; return to shader part epilog ; ; GFX1250-NOECC-LABEL: flat_load_saddr_i16_d16lo_zero_hi: ; GFX1250-NOECC: ; %bb.0: @@ -2091,21 +2111,39 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_zero_hi(ptr inreg %sbase, } define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_zero_hi_immneg128(ptr inreg %sbase, i32 %voffset) { -; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16lo_zero_hi_immneg128: -; GFX1250-SDAG: ; %bb.0: -; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-SDAG-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128 -; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX1250-SDAG-NEXT: ; return to shader part epilog -; -; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16lo_zero_hi_immneg128: -; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-GISEL-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128 -; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX1250-GISEL-NEXT: ; return to shader part epilog +; GFX1250-SDAG-FAKE16-LABEL: flat_load_saddr_i16_d16lo_zero_hi_immneg128: +; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-SDAG-FAKE16-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128 +; GFX1250-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1250-SDAG-FAKE16-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-FAKE16-LABEL: flat_load_saddr_i16_d16lo_zero_hi_immneg128: +; GFX1250-GISEL-FAKE16: ; %bb.0: +; GFX1250-GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-GISEL-FAKE16-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128 +; GFX1250-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1250-GISEL-FAKE16-NEXT: ; return to shader part epilog +; +; GFX1250-SDAG-TRUE16-LABEL: flat_load_saddr_i16_d16lo_zero_hi_immneg128: +; GFX1250-SDAG-TRUE16: ; %bb.0: +; GFX1250-SDAG-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-SDAG-TRUE16-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128 +; GFX1250-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 +; GFX1250-SDAG-TRUE16-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-TRUE16-LABEL: flat_load_saddr_i16_d16lo_zero_hi_immneg128: +; GFX1250-GISEL-TRUE16: ; %bb.0: +; GFX1250-GISEL-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-GISEL-TRUE16-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128 +; GFX1250-GISEL-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-TRUE16-NEXT: v_lshl_or_b32 v0, 0, 16, v0 +; GFX1250-GISEL-TRUE16-NEXT: ; return to shader part epilog ; ; GFX1250-NOECC-LABEL: flat_load_saddr_i16_d16lo_zero_hi_immneg128: ; GFX1250-NOECC: ; %bb.0: @@ -2125,13 +2163,13 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_zero_hi_immneg128(ptr inr } define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_reg_hi(ptr inreg %sbase, i32 %voffset, <2 x i16> %reg) { -; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16lo_reg_hi: -; GFX1250-SDAG: ; %bb.0: -; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-SDAG-NEXT: flat_load_u16 v0, v0, s[2:3] -; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-SDAG-NEXT: v_bfi_b32 v0, 0xffff, v0, v1 -; GFX1250-SDAG-NEXT: ; return to shader part epilog +; GFX1250-SDAG-FAKE16-LABEL: flat_load_saddr_i16_d16lo_reg_hi: +; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-SDAG-FAKE16-NEXT: flat_load_u16 v0, v0, s[2:3] +; GFX1250-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1 +; GFX1250-SDAG-FAKE16-NEXT: ; return to shader part epilog ; ; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16lo_reg_hi: ; GFX1250-GISEL: ; %bb.0: @@ -2141,6 +2179,14 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_reg_hi(ptr inreg %sbase, ; GFX1250-GISEL-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0 ; GFX1250-GISEL-NEXT: ; return to shader part epilog ; +; GFX1250-SDAG-TRUE16-LABEL: flat_load_saddr_i16_d16lo_reg_hi: +; GFX1250-SDAG-TRUE16: ; %bb.0: +; GFX1250-SDAG-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-SDAG-TRUE16-NEXT: flat_load_u16 v0, v0, s[2:3] +; GFX1250-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.h +; GFX1250-SDAG-TRUE16-NEXT: ; return to shader part epilog +; ; GFX1250-NOECC-LABEL: flat_load_saddr_i16_d16lo_reg_hi: ; GFX1250-NOECC: ; %bb.0: ; GFX1250-NOECC-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 @@ -2157,13 +2203,13 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_reg_hi(ptr inreg %sbase, } define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_reg_hi_immneg128(ptr inreg %sbase, i32 %voffset, <2 x i16> %reg) { -; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16lo_reg_hi_immneg128: -; GFX1250-SDAG: ; %bb.0: -; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-SDAG-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128 -; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-SDAG-NEXT: v_bfi_b32 v0, 0xffff, v0, v1 -; GFX1250-SDAG-NEXT: ; return to shader part epilog +; GFX1250-SDAG-FAKE16-LABEL: flat_load_saddr_i16_d16lo_reg_hi_immneg128: +; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-SDAG-FAKE16-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128 +; GFX1250-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1 +; GFX1250-SDAG-FAKE16-NEXT: ; return to shader part epilog ; ; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16lo_reg_hi_immneg128: ; GFX1250-GISEL: ; %bb.0: @@ -2173,6 +2219,14 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_reg_hi_immneg128(ptr inre ; GFX1250-GISEL-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0 ; GFX1250-GISEL-NEXT: ; return to shader part epilog ; +; GFX1250-SDAG-TRUE16-LABEL: flat_load_saddr_i16_d16lo_reg_hi_immneg128: +; GFX1250-SDAG-TRUE16: ; %bb.0: +; GFX1250-SDAG-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-SDAG-TRUE16-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128 +; GFX1250-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.h +; GFX1250-SDAG-TRUE16-NEXT: ; return to shader part epilog +; ; GFX1250-NOECC-LABEL: flat_load_saddr_i16_d16lo_reg_hi_immneg128: ; GFX1250-NOECC: ; %bb.0: ; GFX1250-NOECC-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 @@ -2190,13 +2244,13 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_reg_hi_immneg128(ptr inre } define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_zexti8_reg_hi(ptr inreg %sbase, i32 %voffset, <2 x i16> %reg) { -; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16lo_zexti8_reg_hi: -; GFX1250-SDAG: ; %bb.0: -; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-SDAG-NEXT: flat_load_u8 v0, v0, s[2:3] -; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-SDAG-NEXT: v_bfi_b32 v0, 0xffff, v0, v1 -; GFX1250-SDAG-NEXT: ; return to shader part epilog +; GFX1250-SDAG-FAKE16-LABEL: flat_load_saddr_i16_d16lo_zexti8_reg_hi: +; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-SDAG-FAKE16-NEXT: flat_load_u8 v0, v0, s[2:3] +; GFX1250-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1 +; GFX1250-SDAG-FAKE16-NEXT: ; return to shader part epilog ; ; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16lo_zexti8_reg_hi: ; GFX1250-GISEL: ; %bb.0: @@ -2206,6 +2260,14 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_zexti8_reg_hi(ptr inreg % ; GFX1250-GISEL-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0 ; GFX1250-GISEL-NEXT: ; return to shader part epilog ; +; GFX1250-SDAG-TRUE16-LABEL: flat_load_saddr_i16_d16lo_zexti8_reg_hi: +; GFX1250-SDAG-TRUE16: ; %bb.0: +; GFX1250-SDAG-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-SDAG-TRUE16-NEXT: flat_load_u8 v0, v0, s[2:3] +; GFX1250-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.h +; GFX1250-SDAG-TRUE16-NEXT: ; return to shader part epilog +; ; GFX1250-NOECC-LABEL: flat_load_saddr_i16_d16lo_zexti8_reg_hi: ; GFX1250-NOECC: ; %bb.0: ; GFX1250-NOECC-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 @@ -2223,13 +2285,13 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_zexti8_reg_hi(ptr inreg % } define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128(ptr inreg %sbase, i32 %voffset, <2 x i16> %reg) { -; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128: -; GFX1250-SDAG: ; %bb.0: -; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-SDAG-NEXT: flat_load_u8 v0, v0, s[2:3] offset:-128 -; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-SDAG-NEXT: v_bfi_b32 v0, 0xffff, v0, v1 -; GFX1250-SDAG-NEXT: ; return to shader part epilog +; GFX1250-SDAG-FAKE16-LABEL: flat_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128: +; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-SDAG-FAKE16-NEXT: flat_load_u8 v0, v0, s[2:3] offset:-128 +; GFX1250-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1 +; GFX1250-SDAG-FAKE16-NEXT: ; return to shader part epilog ; ; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128: ; GFX1250-GISEL: ; %bb.0: @@ -2239,6 +2301,14 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128(p ; GFX1250-GISEL-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0 ; GFX1250-GISEL-NEXT: ; return to shader part epilog ; +; GFX1250-SDAG-TRUE16-LABEL: flat_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128: +; GFX1250-SDAG-TRUE16: ; %bb.0: +; GFX1250-SDAG-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-SDAG-TRUE16-NEXT: flat_load_u8 v0, v0, s[2:3] offset:-128 +; GFX1250-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.h +; GFX1250-SDAG-TRUE16-NEXT: ; return to shader part epilog +; ; GFX1250-NOECC-LABEL: flat_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128: ; GFX1250-NOECC: ; %bb.0: ; GFX1250-NOECC-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 @@ -2257,13 +2327,13 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128(p } define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_sexti8_reg_hi(ptr inreg %sbase, i32 %voffset, <2 x i16> %reg) { -; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16lo_sexti8_reg_hi: -; GFX1250-SDAG: ; %bb.0: -; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-SDAG-NEXT: flat_load_i8 v0, v0, s[2:3] -; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-SDAG-NEXT: v_bfi_b32 v0, 0xffff, v0, v1 -; GFX1250-SDAG-NEXT: ; return to shader part epilog +; GFX1250-SDAG-FAKE16-LABEL: flat_load_saddr_i16_d16lo_sexti8_reg_hi: +; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-SDAG-FAKE16-NEXT: flat_load_i8 v0, v0, s[2:3] +; GFX1250-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1 +; GFX1250-SDAG-FAKE16-NEXT: ; return to shader part epilog ; ; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16lo_sexti8_reg_hi: ; GFX1250-GISEL: ; %bb.0: @@ -2275,6 +2345,14 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_sexti8_reg_hi(ptr inreg % ; GFX1250-GISEL-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0 ; GFX1250-GISEL-NEXT: ; return to shader part epilog ; +; GFX1250-SDAG-TRUE16-LABEL: flat_load_saddr_i16_d16lo_sexti8_reg_hi: +; GFX1250-SDAG-TRUE16: ; %bb.0: +; GFX1250-SDAG-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-SDAG-TRUE16-NEXT: flat_load_i8 v0, v0, s[2:3] +; GFX1250-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.h +; GFX1250-SDAG-TRUE16-NEXT: ; return to shader part epilog +; ; GFX1250-NOECC-LABEL: flat_load_saddr_i16_d16lo_sexti8_reg_hi: ; GFX1250-NOECC: ; %bb.0: ; GFX1250-NOECC-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 @@ -2292,13 +2370,13 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_sexti8_reg_hi(ptr inreg % } define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128(ptr inreg %sbase, i32 %voffset, <2 x i16> %reg) { -; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128: -; GFX1250-SDAG: ; %bb.0: -; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-SDAG-NEXT: flat_load_i8 v0, v0, s[2:3] offset:-128 -; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-SDAG-NEXT: v_bfi_b32 v0, 0xffff, v0, v1 -; GFX1250-SDAG-NEXT: ; return to shader part epilog +; GFX1250-SDAG-FAKE16-LABEL: flat_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128: +; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-SDAG-FAKE16-NEXT: flat_load_i8 v0, v0, s[2:3] offset:-128 +; GFX1250-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1 +; GFX1250-SDAG-FAKE16-NEXT: ; return to shader part epilog ; ; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128: ; GFX1250-GISEL: ; %bb.0: @@ -2310,6 +2388,14 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128(p ; GFX1250-GISEL-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0 ; GFX1250-GISEL-NEXT: ; return to shader part epilog ; +; GFX1250-SDAG-TRUE16-LABEL: flat_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128: +; GFX1250-SDAG-TRUE16: ; %bb.0: +; GFX1250-SDAG-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-SDAG-TRUE16-NEXT: flat_load_i8 v0, v0, s[2:3] offset:-128 +; GFX1250-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.h +; GFX1250-SDAG-TRUE16-NEXT: ; return to shader part epilog +; ; GFX1250-NOECC-LABEL: flat_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128: ; GFX1250-NOECC: ; %bb.0: ; GFX1250-NOECC-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 @@ -2332,21 +2418,37 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128(p ; -------------------------------------------------------------------------------- define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_undef_hi(ptr inreg %sbase, i32 %voffset) { -; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16hi_undef_hi: -; GFX1250-SDAG: ; %bb.0: -; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-SDAG-NEXT: flat_load_u16 v0, v0, s[2:3] -; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX1250-SDAG-NEXT: ; return to shader part epilog -; -; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16hi_undef_hi: -; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-GISEL-NEXT: flat_load_u16 v0, v0, s[2:3] -; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX1250-GISEL-NEXT: ; return to shader part epilog +; GFX1250-SDAG-FAKE16-LABEL: flat_load_saddr_i16_d16hi_undef_hi: +; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-SDAG-FAKE16-NEXT: flat_load_u16 v0, v0, s[2:3] +; GFX1250-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-SDAG-FAKE16-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-FAKE16-LABEL: flat_load_saddr_i16_d16hi_undef_hi: +; GFX1250-GISEL-FAKE16: ; %bb.0: +; GFX1250-GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-GISEL-FAKE16-NEXT: flat_load_u16 v0, v0, s[2:3] +; GFX1250-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-GISEL-FAKE16-NEXT: ; return to shader part epilog +; +; GFX1250-SDAG-TRUE16-LABEL: flat_load_saddr_i16_d16hi_undef_hi: +; GFX1250-SDAG-TRUE16: ; %bb.0: +; GFX1250-SDAG-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-SDAG-TRUE16-NEXT: flat_load_u16 v0, v0, s[2:3] +; GFX1250-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l +; GFX1250-SDAG-TRUE16-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-TRUE16-LABEL: flat_load_saddr_i16_d16hi_undef_hi: +; GFX1250-GISEL-TRUE16: ; %bb.0: +; GFX1250-GISEL-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-GISEL-TRUE16-NEXT: flat_load_u16 v0, v0, s[2:3] +; GFX1250-GISEL-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l +; GFX1250-GISEL-TRUE16-NEXT: ; return to shader part epilog ; ; GFX1250-NOECC-LABEL: flat_load_saddr_i16_d16hi_undef_hi: ; GFX1250-NOECC: ; %bb.0: @@ -2363,21 +2465,37 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_undef_hi(ptr inreg %sbase } define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_undef_hi_immneg128(ptr inreg %sbase, i32 %voffset) { -; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16hi_undef_hi_immneg128: -; GFX1250-SDAG: ; %bb.0: -; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-SDAG-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128 -; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX1250-SDAG-NEXT: ; return to shader part epilog -; -; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16hi_undef_hi_immneg128: -; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-GISEL-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128 -; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX1250-GISEL-NEXT: ; return to shader part epilog +; GFX1250-SDAG-FAKE16-LABEL: flat_load_saddr_i16_d16hi_undef_hi_immneg128: +; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-SDAG-FAKE16-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128 +; GFX1250-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-SDAG-FAKE16-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-FAKE16-LABEL: flat_load_saddr_i16_d16hi_undef_hi_immneg128: +; GFX1250-GISEL-FAKE16: ; %bb.0: +; GFX1250-GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-GISEL-FAKE16-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128 +; GFX1250-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-GISEL-FAKE16-NEXT: ; return to shader part epilog +; +; GFX1250-SDAG-TRUE16-LABEL: flat_load_saddr_i16_d16hi_undef_hi_immneg128: +; GFX1250-SDAG-TRUE16: ; %bb.0: +; GFX1250-SDAG-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-SDAG-TRUE16-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128 +; GFX1250-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l +; GFX1250-SDAG-TRUE16-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-TRUE16-LABEL: flat_load_saddr_i16_d16hi_undef_hi_immneg128: +; GFX1250-GISEL-TRUE16: ; %bb.0: +; GFX1250-GISEL-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-GISEL-TRUE16-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128 +; GFX1250-GISEL-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l +; GFX1250-GISEL-TRUE16-NEXT: ; return to shader part epilog ; ; GFX1250-NOECC-LABEL: flat_load_saddr_i16_d16hi_undef_hi_immneg128: ; GFX1250-NOECC: ; %bb.0: @@ -2395,21 +2513,39 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_undef_hi_immneg128(ptr in } define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_zero_hi(ptr inreg %sbase, i32 %voffset) { -; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16hi_zero_hi: -; GFX1250-SDAG: ; %bb.0: -; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-SDAG-NEXT: flat_load_u16 v0, v0, s[2:3] -; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-SDAG-NEXT: v_perm_b32 v0, v0, 0, 0x5040100 -; GFX1250-SDAG-NEXT: ; return to shader part epilog -; -; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16hi_zero_hi: -; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-GISEL-NEXT: flat_load_u16 v0, v0, s[2:3] -; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX1250-GISEL-NEXT: ; return to shader part epilog +; GFX1250-SDAG-FAKE16-LABEL: flat_load_saddr_i16_d16hi_zero_hi: +; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-SDAG-FAKE16-NEXT: flat_load_u16 v0, v0, s[2:3] +; GFX1250-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: v_perm_b32 v0, v0, 0, 0x5040100 +; GFX1250-SDAG-FAKE16-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-FAKE16-LABEL: flat_load_saddr_i16_d16hi_zero_hi: +; GFX1250-GISEL-FAKE16: ; %bb.0: +; GFX1250-GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-GISEL-FAKE16-NEXT: flat_load_u16 v0, v0, s[2:3] +; GFX1250-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-GISEL-FAKE16-NEXT: ; return to shader part epilog +; +; GFX1250-SDAG-TRUE16-LABEL: flat_load_saddr_i16_d16hi_zero_hi: +; GFX1250-SDAG-TRUE16: ; %bb.0: +; GFX1250-SDAG-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-SDAG-TRUE16-NEXT: flat_load_u16 v1, v0, s[2:3] +; GFX1250-SDAG-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 +; GFX1250-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX1250-SDAG-TRUE16-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-TRUE16-LABEL: flat_load_saddr_i16_d16hi_zero_hi: +; GFX1250-GISEL-TRUE16: ; %bb.0: +; GFX1250-GISEL-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-GISEL-TRUE16-NEXT: flat_load_u16 v0, v0, s[2:3] +; GFX1250-GISEL-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, 0 +; GFX1250-GISEL-TRUE16-NEXT: ; return to shader part epilog ; ; GFX1250-NOECC-SDAG-TRUE16-LABEL: flat_load_saddr_i16_d16hi_zero_hi: ; GFX1250-NOECC-SDAG-TRUE16: ; %bb.0: @@ -2437,21 +2573,39 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_zero_hi(ptr inreg %sbase, } define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_zero_hi_immneg128(ptr inreg %sbase, i32 %voffset) { -; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16hi_zero_hi_immneg128: -; GFX1250-SDAG: ; %bb.0: -; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-SDAG-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128 -; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-SDAG-NEXT: v_perm_b32 v0, v0, 0, 0x5040100 -; GFX1250-SDAG-NEXT: ; return to shader part epilog -; -; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16hi_zero_hi_immneg128: -; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-GISEL-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128 -; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX1250-GISEL-NEXT: ; return to shader part epilog +; GFX1250-SDAG-FAKE16-LABEL: flat_load_saddr_i16_d16hi_zero_hi_immneg128: +; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-SDAG-FAKE16-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128 +; GFX1250-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: v_perm_b32 v0, v0, 0, 0x5040100 +; GFX1250-SDAG-FAKE16-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-FAKE16-LABEL: flat_load_saddr_i16_d16hi_zero_hi_immneg128: +; GFX1250-GISEL-FAKE16: ; %bb.0: +; GFX1250-GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-GISEL-FAKE16-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128 +; GFX1250-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-GISEL-FAKE16-NEXT: ; return to shader part epilog +; +; GFX1250-SDAG-TRUE16-LABEL: flat_load_saddr_i16_d16hi_zero_hi_immneg128: +; GFX1250-SDAG-TRUE16: ; %bb.0: +; GFX1250-SDAG-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-SDAG-TRUE16-NEXT: flat_load_u16 v1, v0, s[2:3] offset:-128 +; GFX1250-SDAG-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 +; GFX1250-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX1250-SDAG-TRUE16-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-TRUE16-LABEL: flat_load_saddr_i16_d16hi_zero_hi_immneg128: +; GFX1250-GISEL-TRUE16: ; %bb.0: +; GFX1250-GISEL-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-GISEL-TRUE16-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128 +; GFX1250-GISEL-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, 0 +; GFX1250-GISEL-TRUE16-NEXT: ; return to shader part epilog ; ; GFX1250-NOECC-SDAG-TRUE16-LABEL: flat_load_saddr_i16_d16hi_zero_hi_immneg128: ; GFX1250-NOECC-SDAG-TRUE16: ; %bb.0: @@ -2480,13 +2634,13 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_zero_hi_immneg128(ptr inr } define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_reg_hi(ptr inreg %sbase, i32 %voffset, <2 x i16> %reg) { -; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16hi_reg_hi: -; GFX1250-SDAG: ; %bb.0: -; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-SDAG-NEXT: flat_load_u16 v0, v0, s[2:3] -; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-SDAG-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 -; GFX1250-SDAG-NEXT: ; return to shader part epilog +; GFX1250-SDAG-FAKE16-LABEL: flat_load_saddr_i16_d16hi_reg_hi: +; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-SDAG-FAKE16-NEXT: flat_load_u16 v0, v0, s[2:3] +; GFX1250-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX1250-SDAG-FAKE16-NEXT: ; return to shader part epilog ; ; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16hi_reg_hi: ; GFX1250-GISEL: ; %bb.0: @@ -2498,6 +2652,16 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_reg_hi(ptr inreg %sbase, ; GFX1250-GISEL-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 ; GFX1250-GISEL-NEXT: ; return to shader part epilog ; +; GFX1250-SDAG-TRUE16-LABEL: flat_load_saddr_i16_d16hi_reg_hi: +; GFX1250-SDAG-TRUE16: ; %bb.0: +; GFX1250-SDAG-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-SDAG-TRUE16-NEXT: flat_load_u16 v0, v0, s[2:3] +; GFX1250-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX1250-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-TRUE16-NEXT: v_mov_b32_e32 v0, v1 +; GFX1250-SDAG-TRUE16-NEXT: ; return to shader part epilog +; ; GFX1250-NOECC-LABEL: flat_load_saddr_i16_d16hi_reg_hi: ; GFX1250-NOECC: ; %bb.0: ; GFX1250-NOECC-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 @@ -2514,13 +2678,13 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_reg_hi(ptr inreg %sbase, } define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_reg_hi_immneg128(ptr inreg %sbase, i32 %voffset, <2 x i16> %reg) { -; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16hi_reg_hi_immneg128: -; GFX1250-SDAG: ; %bb.0: -; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-SDAG-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128 -; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-SDAG-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 -; GFX1250-SDAG-NEXT: ; return to shader part epilog +; GFX1250-SDAG-FAKE16-LABEL: flat_load_saddr_i16_d16hi_reg_hi_immneg128: +; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-SDAG-FAKE16-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128 +; GFX1250-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX1250-SDAG-FAKE16-NEXT: ; return to shader part epilog ; ; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16hi_reg_hi_immneg128: ; GFX1250-GISEL: ; %bb.0: @@ -2532,6 +2696,16 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_reg_hi_immneg128(ptr inre ; GFX1250-GISEL-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 ; GFX1250-GISEL-NEXT: ; return to shader part epilog ; +; GFX1250-SDAG-TRUE16-LABEL: flat_load_saddr_i16_d16hi_reg_hi_immneg128: +; GFX1250-SDAG-TRUE16: ; %bb.0: +; GFX1250-SDAG-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-SDAG-TRUE16-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128 +; GFX1250-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX1250-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-TRUE16-NEXT: v_mov_b32_e32 v0, v1 +; GFX1250-SDAG-TRUE16-NEXT: ; return to shader part epilog +; ; GFX1250-NOECC-LABEL: flat_load_saddr_i16_d16hi_reg_hi_immneg128: ; GFX1250-NOECC: ; %bb.0: ; GFX1250-NOECC-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 @@ -2549,13 +2723,13 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_reg_hi_immneg128(ptr inre } define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_zexti8_reg_hi(ptr inreg %sbase, i32 %voffset, <2 x i16> %reg) { -; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16hi_zexti8_reg_hi: -; GFX1250-SDAG: ; %bb.0: -; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-SDAG-NEXT: flat_load_u8 v0, v0, s[2:3] -; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-SDAG-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 -; GFX1250-SDAG-NEXT: ; return to shader part epilog +; GFX1250-SDAG-FAKE16-LABEL: flat_load_saddr_i16_d16hi_zexti8_reg_hi: +; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-SDAG-FAKE16-NEXT: flat_load_u8 v0, v0, s[2:3] +; GFX1250-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX1250-SDAG-FAKE16-NEXT: ; return to shader part epilog ; ; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16hi_zexti8_reg_hi: ; GFX1250-GISEL: ; %bb.0: @@ -2567,6 +2741,16 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_zexti8_reg_hi(ptr inreg % ; GFX1250-GISEL-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 ; GFX1250-GISEL-NEXT: ; return to shader part epilog ; +; GFX1250-SDAG-TRUE16-LABEL: flat_load_saddr_i16_d16hi_zexti8_reg_hi: +; GFX1250-SDAG-TRUE16: ; %bb.0: +; GFX1250-SDAG-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-SDAG-TRUE16-NEXT: flat_load_u8 v0, v0, s[2:3] +; GFX1250-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX1250-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-TRUE16-NEXT: v_mov_b32_e32 v0, v1 +; GFX1250-SDAG-TRUE16-NEXT: ; return to shader part epilog +; ; GFX1250-NOECC-LABEL: flat_load_saddr_i16_d16hi_zexti8_reg_hi: ; GFX1250-NOECC: ; %bb.0: ; GFX1250-NOECC-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 @@ -2584,13 +2768,13 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_zexti8_reg_hi(ptr inreg % } define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128(ptr inreg %sbase, i32 %voffset, <2 x i16> %reg) { -; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128: -; GFX1250-SDAG: ; %bb.0: -; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-SDAG-NEXT: flat_load_u8 v0, v0, s[2:3] offset:-128 -; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-SDAG-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 -; GFX1250-SDAG-NEXT: ; return to shader part epilog +; GFX1250-SDAG-FAKE16-LABEL: flat_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128: +; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-SDAG-FAKE16-NEXT: flat_load_u8 v0, v0, s[2:3] offset:-128 +; GFX1250-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX1250-SDAG-FAKE16-NEXT: ; return to shader part epilog ; ; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128: ; GFX1250-GISEL: ; %bb.0: @@ -2602,6 +2786,16 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128(p ; GFX1250-GISEL-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 ; GFX1250-GISEL-NEXT: ; return to shader part epilog ; +; GFX1250-SDAG-TRUE16-LABEL: flat_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128: +; GFX1250-SDAG-TRUE16: ; %bb.0: +; GFX1250-SDAG-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-SDAG-TRUE16-NEXT: flat_load_u8 v0, v0, s[2:3] offset:-128 +; GFX1250-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX1250-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-TRUE16-NEXT: v_mov_b32_e32 v0, v1 +; GFX1250-SDAG-TRUE16-NEXT: ; return to shader part epilog +; ; GFX1250-NOECC-LABEL: flat_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128: ; GFX1250-NOECC: ; %bb.0: ; GFX1250-NOECC-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 @@ -2620,13 +2814,13 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128(p } define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_sexti8_reg_hi(ptr inreg %sbase, i32 %voffset, <2 x i16> %reg) { -; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16hi_sexti8_reg_hi: -; GFX1250-SDAG: ; %bb.0: -; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-SDAG-NEXT: flat_load_i8 v0, v0, s[2:3] -; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-SDAG-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 -; GFX1250-SDAG-NEXT: ; return to shader part epilog +; GFX1250-SDAG-FAKE16-LABEL: flat_load_saddr_i16_d16hi_sexti8_reg_hi: +; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-SDAG-FAKE16-NEXT: flat_load_i8 v0, v0, s[2:3] +; GFX1250-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX1250-SDAG-FAKE16-NEXT: ; return to shader part epilog ; ; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16hi_sexti8_reg_hi: ; GFX1250-GISEL: ; %bb.0: @@ -2639,6 +2833,16 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_sexti8_reg_hi(ptr inreg % ; GFX1250-GISEL-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 ; GFX1250-GISEL-NEXT: ; return to shader part epilog ; +; GFX1250-SDAG-TRUE16-LABEL: flat_load_saddr_i16_d16hi_sexti8_reg_hi: +; GFX1250-SDAG-TRUE16: ; %bb.0: +; GFX1250-SDAG-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-SDAG-TRUE16-NEXT: flat_load_i8 v0, v0, s[2:3] +; GFX1250-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX1250-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-TRUE16-NEXT: v_mov_b32_e32 v0, v1 +; GFX1250-SDAG-TRUE16-NEXT: ; return to shader part epilog +; ; GFX1250-NOECC-LABEL: flat_load_saddr_i16_d16hi_sexti8_reg_hi: ; GFX1250-NOECC: ; %bb.0: ; GFX1250-NOECC-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 @@ -2656,13 +2860,13 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_sexti8_reg_hi(ptr inreg % } define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128(ptr inreg %sbase, i32 %voffset, <2 x i16> %reg) { -; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128: -; GFX1250-SDAG: ; %bb.0: -; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-SDAG-NEXT: flat_load_i8 v0, v0, s[2:3] offset:-128 -; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-SDAG-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 -; GFX1250-SDAG-NEXT: ; return to shader part epilog +; GFX1250-SDAG-FAKE16-LABEL: flat_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128: +; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-SDAG-FAKE16-NEXT: flat_load_i8 v0, v0, s[2:3] offset:-128 +; GFX1250-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX1250-SDAG-FAKE16-NEXT: ; return to shader part epilog ; ; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128: ; GFX1250-GISEL: ; %bb.0: @@ -2675,6 +2879,16 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128(p ; GFX1250-GISEL-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 ; GFX1250-GISEL-NEXT: ; return to shader part epilog ; +; GFX1250-SDAG-TRUE16-LABEL: flat_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128: +; GFX1250-SDAG-TRUE16: ; %bb.0: +; GFX1250-SDAG-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-SDAG-TRUE16-NEXT: flat_load_i8 v0, v0, s[2:3] offset:-128 +; GFX1250-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX1250-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-TRUE16-NEXT: v_mov_b32_e32 v0, v1 +; GFX1250-SDAG-TRUE16-NEXT: ; return to shader part epilog +; ; GFX1250-NOECC-LABEL: flat_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128: ; GFX1250-NOECC: ; %bb.0: ; GFX1250-NOECC-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum3.v2f16.ll b/llvm/test/CodeGen/AMDGPU/fmaximum3.v2f16.ll index 852c9cf8face8..b1bb6796e6d48 100644 --- a/llvm/test/CodeGen/AMDGPU/fmaximum3.v2f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmaximum3.v2f16.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250,GFX1250-SDAG %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250,GFX1250-GISEL %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck --check-prefixes=GFX1250,GFX1250-SDAG,GFX1250-SDAG-FAKE16 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck --check-prefixes=GFX1250,GFX1250-GISEL,GFX1250-GISEL-FAKE16 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck --check-prefixes=GFX1250,GFX1250-SDAG,GFX1250-SDAG-REAL16 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck --check-prefixes=GFX1250,GFX1250-GISEL,GFX1250-GISEL-REAL16 %s define <2 x half> @fmaximum3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) { ; GFX1250-SDAG-LABEL: fmaximum3_v2f16: @@ -55,16 +57,27 @@ define <3 x half> @fmaximum3_v3f16(<3 x half> %a, <3 x half> %b, <3 x half> %c) ; GFX1250-SDAG-NEXT: v_pk_maximum3_f16 v1, v5, v1, v3 ; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] ; -; GFX1250-GISEL-LABEL: fmaximum3_v3f16: -; GFX1250-GISEL: ; %bb.0: ; %entry -; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1250-GISEL-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 -; GFX1250-GISEL-NEXT: v_maximum_f16 v1, v1, v3 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250-GISEL-NEXT: v_pk_maximum3_f16 v0, v4, v0, v0 -; GFX1250-GISEL-NEXT: v_maximum_f16 v1, v5, v1 -; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-GISEL-FAKE16-LABEL: fmaximum3_v3f16: +; GFX1250-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX1250-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 +; GFX1250-GISEL-FAKE16-NEXT: v_maximum_f16 v1, v1, v3 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-GISEL-FAKE16-NEXT: v_pk_maximum3_f16 v0, v4, v0, v0 +; GFX1250-GISEL-FAKE16-NEXT: v_maximum_f16 v1, v5, v1 +; GFX1250-GISEL-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-REAL16-LABEL: fmaximum3_v3f16: +; GFX1250-GISEL-REAL16: ; %bb.0: ; %entry +; GFX1250-GISEL-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-REAL16-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 +; GFX1250-GISEL-REAL16-NEXT: v_maximum_f16 v1.l, v1.l, v3.l +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-GISEL-REAL16-NEXT: v_pk_maximum3_f16 v0, v4, v0, v0 +; GFX1250-GISEL-REAL16-NEXT: v_maximum_f16 v1.l, v5.l, v1.l +; GFX1250-GISEL-REAL16-NEXT: s_set_pc_i64 s[30:31] entry: %min = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a, <3 x half> %b) %res = call <3 x half> @llvm.maximum.v3f16(<3 x half> %c, <3 x half> %min) @@ -97,3 +110,5 @@ entry: } ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; GFX1250: {{.*}} +; GFX1250-SDAG-FAKE16: {{.*}} +; GFX1250-SDAG-REAL16: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/fminimum3.v2f16.ll b/llvm/test/CodeGen/AMDGPU/fminimum3.v2f16.ll index df9fb10badb9a..78d9bb15d3835 100644 --- a/llvm/test/CodeGen/AMDGPU/fminimum3.v2f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fminimum3.v2f16.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250,GFX1250-SDAG %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250,GFX1250-GISEL %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck --check-prefixes=GFX1250,GFX1250-SDAG,GFX1250-SDAG-FAKE16 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck --check-prefixes=GFX1250,GFX1250-GISEL,GFX1250-GISEL-FAKE16 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck --check-prefixes=GFX1250,GFX1250-SDAG,GFX1250-SDAG-REAL16 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck --check-prefixes=GFX1250,GFX1250-GISEL,GFX1250-GISEL-REAL16 %s define <2 x half> @fminimum3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) { ; GFX1250-SDAG-LABEL: fminimum3_v2f16: @@ -55,16 +57,27 @@ define <3 x half> @fminimum3_v3f16(<3 x half> %a, <3 x half> %b, <3 x half> %c) ; GFX1250-SDAG-NEXT: v_pk_minimum3_f16 v1, v5, v1, v3 ; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] ; -; GFX1250-GISEL-LABEL: fminimum3_v3f16: -; GFX1250-GISEL: ; %bb.0: ; %entry -; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1250-GISEL-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 -; GFX1250-GISEL-NEXT: v_minimum_f16 v1, v1, v3 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250-GISEL-NEXT: v_pk_minimum3_f16 v0, v4, v0, v0 -; GFX1250-GISEL-NEXT: v_minimum_f16 v1, v5, v1 -; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-GISEL-FAKE16-LABEL: fminimum3_v3f16: +; GFX1250-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX1250-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 +; GFX1250-GISEL-FAKE16-NEXT: v_minimum_f16 v1, v1, v3 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-GISEL-FAKE16-NEXT: v_pk_minimum3_f16 v0, v4, v0, v0 +; GFX1250-GISEL-FAKE16-NEXT: v_minimum_f16 v1, v5, v1 +; GFX1250-GISEL-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-REAL16-LABEL: fminimum3_v3f16: +; GFX1250-GISEL-REAL16: ; %bb.0: ; %entry +; GFX1250-GISEL-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-REAL16-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 +; GFX1250-GISEL-REAL16-NEXT: v_minimum_f16 v1.l, v1.l, v3.l +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-GISEL-REAL16-NEXT: v_pk_minimum3_f16 v0, v4, v0, v0 +; GFX1250-GISEL-REAL16-NEXT: v_minimum_f16 v1.l, v5.l, v1.l +; GFX1250-GISEL-REAL16-NEXT: s_set_pc_i64 s[30:31] entry: %min = call <3 x half> @llvm.minimum.v3f16(<3 x half> %a, <3 x half> %b) %res = call <3 x half> @llvm.minimum.v3f16(<3 x half> %c, <3 x half> %min) @@ -97,3 +110,5 @@ entry: } ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; GFX1250: {{.*}} +; GFX1250-SDAG-FAKE16: {{.*}} +; GFX1250-SDAG-REAL16: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll index fc064e24d74e7..a707de96eb116 100644 --- a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll +++ b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll @@ -1,31 +1,33 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GCN-SDAG %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GCN-GISEL %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GCN,GCN-SDAG,GCN-SDAG-FAKE16 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GCN,GCN-GISEL,GCN-GISEL-FAKE16 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GCN,GCN-SDAG,GCN-SDAG-REAL16 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GCN,GCN-GISEL,GCN-GISEL-REAL16 %s ; Test S_WAIT_XCNT insertion for global_load/store clauses. ; Introduced additional operations in between the clauses to have the register dependency ; between the operands of VMEM operations and the def ops of VALU instructions that followed. define void @test_i8load_v4i8store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %ptr_b, ptr addrspace(1) %ptr_c, ptr addrspace(1) %ptr_d, ptr addrspace(1) %out) { -; GCN-SDAG-LABEL: test_i8load_v4i8store: -; GCN-SDAG: ; %bb.0: -; GCN-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GCN-SDAG-NEXT: s_wait_kmcnt 0x0 -; GCN-SDAG-NEXT: global_load_u8 v6, v[4:5], off -; GCN-SDAG-NEXT: global_load_u8 v7, v[2:3], off -; GCN-SDAG-NEXT: global_load_u8 v10, v[0:1], off -; GCN-SDAG-NEXT: s_wait_loadcnt 0x2 -; GCN-SDAG-NEXT: s_wait_xcnt 0x0 -; GCN-SDAG-NEXT: v_lshlrev_b16 v0, 8, v6 -; GCN-SDAG-NEXT: s_wait_loadcnt 0x0 -; GCN-SDAG-NEXT: v_perm_b32 v1, v10, v7, 0xc0c0004 -; GCN-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GCN-SDAG-NEXT: v_or_b32_e32 v0, v6, v0 -; GCN-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-SDAG-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-SDAG-NEXT: global_store_b32 v[8:9], v0, off -; GCN-SDAG-NEXT: s_set_pc_i64 s[30:31] +; GCN-SDAG-FAKE16-LABEL: test_i8load_v4i8store: +; GCN-SDAG-FAKE16: ; %bb.0: +; GCN-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GCN-SDAG-FAKE16-NEXT: global_load_u8 v6, v[4:5], off +; GCN-SDAG-FAKE16-NEXT: global_load_u8 v7, v[2:3], off +; GCN-SDAG-FAKE16-NEXT: global_load_u8 v10, v[0:1], off +; GCN-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x2 +; GCN-SDAG-FAKE16-NEXT: s_wait_xcnt 0x0 +; GCN-SDAG-FAKE16-NEXT: v_lshlrev_b16 v0, 8, v6 +; GCN-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GCN-SDAG-FAKE16-NEXT: v_perm_b32 v1, v10, v7, 0xc0c0004 +; GCN-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GCN-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, v6, v0 +; GCN-SDAG-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GCN-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-SDAG-FAKE16-NEXT: global_store_b32 v[8:9], v0, off +; GCN-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31] ; ; GCN-GISEL-LABEL: test_i8load_v4i8store: ; GCN-GISEL: ; %bb.0: @@ -44,6 +46,25 @@ define void @test_i8load_v4i8store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt ; GCN-GISEL-NEXT: v_or3_b32 v0, v0, v1, v2 ; GCN-GISEL-NEXT: global_store_b32 v[8:9], v0, off ; GCN-GISEL-NEXT: s_set_pc_i64 s[30:31] +; +; GCN-SDAG-REAL16-LABEL: test_i8load_v4i8store: +; GCN-SDAG-REAL16: ; %bb.0: +; GCN-SDAG-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-SDAG-REAL16-NEXT: s_wait_kmcnt 0x0 +; GCN-SDAG-REAL16-NEXT: global_load_u8 v6, v[4:5], off +; GCN-SDAG-REAL16-NEXT: global_load_u8 v7, v[2:3], off +; GCN-SDAG-REAL16-NEXT: global_load_u8 v10, v[0:1], off +; GCN-SDAG-REAL16-NEXT: s_wait_loadcnt 0x2 +; GCN-SDAG-REAL16-NEXT: s_wait_xcnt 0x0 +; GCN-SDAG-REAL16-NEXT: v_lshlrev_b16 v0.l, 8, v6.l +; GCN-SDAG-REAL16-NEXT: s_wait_loadcnt 0x0 +; GCN-SDAG-REAL16-NEXT: v_perm_b32 v1, v10, v7, 0xc0c0004 +; GCN-SDAG-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GCN-SDAG-REAL16-NEXT: v_or_b16 v0.h, v6.l, v0.l +; GCN-SDAG-REAL16-NEXT: v_mov_b16_e32 v0.l, 0 +; GCN-SDAG-REAL16-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-SDAG-REAL16-NEXT: global_store_b32 v[8:9], v0, off +; GCN-SDAG-REAL16-NEXT: s_set_pc_i64 s[30:31] %a = load i8, ptr addrspace(1) %ptr_a %b = load i8, ptr addrspace(1) %ptr_b %c = load i8, ptr addrspace(1) %ptr_c @@ -57,58 +78,111 @@ define void @test_i8load_v4i8store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt } define i16 @test_v7i16_load_store(ptr addrspace(1) %ptr1, ptr addrspace(1) %ptr2) { -; GCN-SDAG-LABEL: test_v7i16_load_store: -; GCN-SDAG: ; %bb.0: -; GCN-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GCN-SDAG-NEXT: s_wait_kmcnt 0x0 -; GCN-SDAG-NEXT: global_load_b128 v[4:7], v[0:1], off -; GCN-SDAG-NEXT: global_load_b128 v[8:11], v[2:3], off -; GCN-SDAG-NEXT: v_mov_b64_e32 v[12:13], 0 -; GCN-SDAG-NEXT: s_wait_loadcnt 0x0 -; GCN-SDAG-NEXT: s_wait_xcnt 0x1 -; GCN-SDAG-NEXT: v_pk_add_u16 v1, v6, v10 -; GCN-SDAG-NEXT: s_wait_xcnt 0x0 -; GCN-SDAG-NEXT: v_pk_add_u16 v3, v5, v9 -; GCN-SDAG-NEXT: v_pk_add_u16 v5, v7, v11 -; GCN-SDAG-NEXT: v_mov_b64_e32 v[6:7], 12 -; GCN-SDAG-NEXT: v_mov_b64_e32 v[10:11], 8 -; GCN-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; GCN-SDAG-NEXT: v_pk_add_u16 v2, v4, v8 -; GCN-SDAG-NEXT: s_clause 0x2 -; GCN-SDAG-NEXT: global_store_b16 v[6:7], v5, off -; GCN-SDAG-NEXT: global_store_b32 v[10:11], v1, off -; GCN-SDAG-NEXT: global_store_b64 v[12:13], v[2:3], off -; GCN-SDAG-NEXT: s_set_pc_i64 s[30:31] +; GCN-SDAG-FAKE16-LABEL: test_v7i16_load_store: +; GCN-SDAG-FAKE16: ; %bb.0: +; GCN-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GCN-SDAG-FAKE16-NEXT: global_load_b128 v[4:7], v[0:1], off +; GCN-SDAG-FAKE16-NEXT: global_load_b128 v[8:11], v[2:3], off +; GCN-SDAG-FAKE16-NEXT: v_mov_b64_e32 v[12:13], 0 +; GCN-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GCN-SDAG-FAKE16-NEXT: s_wait_xcnt 0x1 +; GCN-SDAG-FAKE16-NEXT: v_pk_add_u16 v1, v6, v10 +; GCN-SDAG-FAKE16-NEXT: s_wait_xcnt 0x0 +; GCN-SDAG-FAKE16-NEXT: v_pk_add_u16 v3, v5, v9 +; GCN-SDAG-FAKE16-NEXT: v_pk_add_u16 v5, v7, v11 +; GCN-SDAG-FAKE16-NEXT: v_mov_b64_e32 v[6:7], 12 +; GCN-SDAG-FAKE16-NEXT: v_mov_b64_e32 v[10:11], 8 +; GCN-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GCN-SDAG-FAKE16-NEXT: v_pk_add_u16 v2, v4, v8 +; GCN-SDAG-FAKE16-NEXT: s_clause 0x2 +; GCN-SDAG-FAKE16-NEXT: global_store_b16 v[6:7], v5, off +; GCN-SDAG-FAKE16-NEXT: global_store_b32 v[10:11], v1, off +; GCN-SDAG-FAKE16-NEXT: global_store_b64 v[12:13], v[2:3], off +; GCN-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31] ; -; GCN-GISEL-LABEL: test_v7i16_load_store: -; GCN-GISEL: ; %bb.0: -; GCN-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GCN-GISEL-NEXT: s_wait_kmcnt 0x0 -; GCN-GISEL-NEXT: global_load_b128 v[4:7], v[0:1], off -; GCN-GISEL-NEXT: global_load_b128 v[8:11], v[2:3], off -; GCN-GISEL-NEXT: s_wait_xcnt 0x0 -; GCN-GISEL-NEXT: v_mov_b64_e32 v[2:3], 0 -; GCN-GISEL-NEXT: v_mov_b64_e32 v[12:13], 2 -; GCN-GISEL-NEXT: v_mov_b64_e32 v[14:15], 4 -; GCN-GISEL-NEXT: v_mov_b64_e32 v[16:17], 6 -; GCN-GISEL-NEXT: v_mov_b64_e32 v[18:19], 8 -; GCN-GISEL-NEXT: v_mov_b64_e32 v[20:21], 10 -; GCN-GISEL-NEXT: v_mov_b64_e32 v[22:23], 12 -; GCN-GISEL-NEXT: s_wait_loadcnt 0x0 -; GCN-GISEL-NEXT: v_pk_add_u16 v1, v6, v10 -; GCN-GISEL-NEXT: v_pk_add_u16 v4, v4, v8 -; GCN-GISEL-NEXT: v_pk_add_u16 v5, v5, v9 -; GCN-GISEL-NEXT: v_pk_add_u16 v6, v7, v11 -; GCN-GISEL-NEXT: s_clause 0x6 -; GCN-GISEL-NEXT: global_store_b16 v[2:3], v4, off -; GCN-GISEL-NEXT: global_store_d16_hi_b16 v[12:13], v4, off -; GCN-GISEL-NEXT: global_store_b16 v[14:15], v5, off -; GCN-GISEL-NEXT: global_store_d16_hi_b16 v[16:17], v5, off -; GCN-GISEL-NEXT: global_store_b16 v[18:19], v1, off -; GCN-GISEL-NEXT: global_store_d16_hi_b16 v[20:21], v1, off -; GCN-GISEL-NEXT: global_store_b16 v[22:23], v6, off -; GCN-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; GCN-GISEL-NEXT: s_set_pc_i64 s[30:31] +; GCN-GISEL-FAKE16-LABEL: test_v7i16_load_store: +; GCN-GISEL-FAKE16: ; %bb.0: +; GCN-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GCN-GISEL-FAKE16-NEXT: global_load_b128 v[4:7], v[0:1], off +; GCN-GISEL-FAKE16-NEXT: global_load_b128 v[8:11], v[2:3], off +; GCN-GISEL-FAKE16-NEXT: s_wait_xcnt 0x0 +; GCN-GISEL-FAKE16-NEXT: v_mov_b64_e32 v[2:3], 0 +; GCN-GISEL-FAKE16-NEXT: v_mov_b64_e32 v[12:13], 2 +; GCN-GISEL-FAKE16-NEXT: v_mov_b64_e32 v[14:15], 4 +; GCN-GISEL-FAKE16-NEXT: v_mov_b64_e32 v[16:17], 6 +; GCN-GISEL-FAKE16-NEXT: v_mov_b64_e32 v[18:19], 8 +; GCN-GISEL-FAKE16-NEXT: v_mov_b64_e32 v[20:21], 10 +; GCN-GISEL-FAKE16-NEXT: v_mov_b64_e32 v[22:23], 12 +; GCN-GISEL-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GCN-GISEL-FAKE16-NEXT: v_pk_add_u16 v1, v6, v10 +; GCN-GISEL-FAKE16-NEXT: v_pk_add_u16 v4, v4, v8 +; GCN-GISEL-FAKE16-NEXT: v_pk_add_u16 v5, v5, v9 +; GCN-GISEL-FAKE16-NEXT: v_pk_add_u16 v6, v7, v11 +; GCN-GISEL-FAKE16-NEXT: s_clause 0x6 +; GCN-GISEL-FAKE16-NEXT: global_store_b16 v[2:3], v4, off +; GCN-GISEL-FAKE16-NEXT: global_store_d16_hi_b16 v[12:13], v4, off +; GCN-GISEL-FAKE16-NEXT: global_store_b16 v[14:15], v5, off +; GCN-GISEL-FAKE16-NEXT: global_store_d16_hi_b16 v[16:17], v5, off +; GCN-GISEL-FAKE16-NEXT: global_store_b16 v[18:19], v1, off +; GCN-GISEL-FAKE16-NEXT: global_store_d16_hi_b16 v[20:21], v1, off +; GCN-GISEL-FAKE16-NEXT: global_store_b16 v[22:23], v6, off +; GCN-GISEL-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GCN-GISEL-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GCN-SDAG-REAL16-LABEL: test_v7i16_load_store: +; GCN-SDAG-REAL16: ; %bb.0: +; GCN-SDAG-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-SDAG-REAL16-NEXT: s_wait_kmcnt 0x0 +; GCN-SDAG-REAL16-NEXT: global_load_b128 v[4:7], v[0:1], off +; GCN-SDAG-REAL16-NEXT: global_load_b128 v[8:11], v[2:3], off +; GCN-SDAG-REAL16-NEXT: s_wait_xcnt 0x0 +; GCN-SDAG-REAL16-NEXT: v_mov_b64_e32 v[2:3], 12 +; GCN-SDAG-REAL16-NEXT: v_mov_b64_e32 v[12:13], 8 +; GCN-SDAG-REAL16-NEXT: v_mov_b64_e32 v[14:15], 0 +; GCN-SDAG-REAL16-NEXT: s_wait_loadcnt 0x0 +; GCN-SDAG-REAL16-NEXT: v_pk_add_u16 v1, v7, v11 +; GCN-SDAG-REAL16-NEXT: v_pk_add_u16 v0, v6, v10 +; GCN-SDAG-REAL16-NEXT: v_pk_add_u16 v5, v5, v9 +; GCN-SDAG-REAL16-NEXT: v_pk_add_u16 v4, v4, v8 +; GCN-SDAG-REAL16-NEXT: s_clause 0x2 +; GCN-SDAG-REAL16-NEXT: global_store_b16 v[2:3], v1, off +; GCN-SDAG-REAL16-NEXT: global_store_b32 v[12:13], v0, off +; GCN-SDAG-REAL16-NEXT: global_store_b64 v[14:15], v[4:5], off +; GCN-SDAG-REAL16-NEXT: s_wait_xcnt 0x1 +; GCN-SDAG-REAL16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GCN-SDAG-REAL16-NEXT: s_set_pc_i64 s[30:31] +; +; GCN-GISEL-REAL16-LABEL: test_v7i16_load_store: +; GCN-GISEL-REAL16: ; %bb.0: +; GCN-GISEL-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-GISEL-REAL16-NEXT: s_wait_kmcnt 0x0 +; GCN-GISEL-REAL16-NEXT: global_load_b128 v[4:7], v[0:1], off +; GCN-GISEL-REAL16-NEXT: global_load_b128 v[8:11], v[2:3], off +; GCN-GISEL-REAL16-NEXT: s_wait_xcnt 0x0 +; GCN-GISEL-REAL16-NEXT: v_mov_b64_e32 v[2:3], 0 +; GCN-GISEL-REAL16-NEXT: v_mov_b64_e32 v[12:13], 2 +; GCN-GISEL-REAL16-NEXT: v_mov_b64_e32 v[14:15], 4 +; GCN-GISEL-REAL16-NEXT: v_mov_b64_e32 v[16:17], 6 +; GCN-GISEL-REAL16-NEXT: v_mov_b64_e32 v[18:19], 8 +; GCN-GISEL-REAL16-NEXT: v_mov_b64_e32 v[20:21], 10 +; GCN-GISEL-REAL16-NEXT: v_mov_b64_e32 v[22:23], 12 +; GCN-GISEL-REAL16-NEXT: s_wait_loadcnt 0x0 +; GCN-GISEL-REAL16-NEXT: v_pk_add_u16 v1, v6, v10 +; GCN-GISEL-REAL16-NEXT: v_pk_add_u16 v4, v4, v8 +; GCN-GISEL-REAL16-NEXT: v_pk_add_u16 v5, v5, v9 +; GCN-GISEL-REAL16-NEXT: v_pk_add_u16 v6, v7, v11 +; GCN-GISEL-REAL16-NEXT: s_clause 0x6 +; GCN-GISEL-REAL16-NEXT: global_store_b16 v[2:3], v4, off +; GCN-GISEL-REAL16-NEXT: global_store_d16_hi_b16 v[12:13], v4, off +; GCN-GISEL-REAL16-NEXT: global_store_b16 v[14:15], v5, off +; GCN-GISEL-REAL16-NEXT: global_store_d16_hi_b16 v[16:17], v5, off +; GCN-GISEL-REAL16-NEXT: global_store_b16 v[18:19], v1, off +; GCN-GISEL-REAL16-NEXT: global_store_d16_hi_b16 v[20:21], v1, off +; GCN-GISEL-REAL16-NEXT: global_store_b16 v[22:23], v6, off +; GCN-GISEL-REAL16-NEXT: v_mov_b16_e32 v0.l, v1.h +; GCN-GISEL-REAL16-NEXT: s_set_pc_i64 s[30:31] %vec1 = load <7 x i16>, ptr addrspace(1) %ptr1 %insert = insertelement <7 x i16> %vec1, i16 20, i32 4 %vec2 = load <7 x i16>, ptr addrspace(1) %ptr2 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll index a6ab80848e564..3d91e82f1b357 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll @@ -3,7 +3,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga < %s | FileCheck -check-prefix=VI %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX900 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-FAKE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-REAL16 %s define amdgpu_kernel void @s_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 { ; SI-LABEL: s_insertelement_v2bf16_0: @@ -222,19 +223,31 @@ define amdgpu_kernel void @v_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr a ; GFX942-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NEXT: s_endpgm ; -; GFX1250-LABEL: v_insertelement_v2bf16_0: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 nv -; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scale_offset -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_movk_i32 s2, 0x40a0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_bfi_b32 v1, 0xffff, s2, v1 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scale_offset -; GFX1250-NEXT: s_endpgm +; GFX1250-FAKE16-LABEL: v_insertelement_v2bf16_0: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 nv +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: global_load_b32 v1, v0, s[2:3] scale_offset +; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: s_movk_i32 s2, 0x40a0 +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: v_bfi_b32 v1, 0xffff, s2, v1 +; GFX1250-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1] scale_offset +; GFX1250-FAKE16-NEXT: s_endpgm +; +; GFX1250-REAL16-LABEL: v_insertelement_v2bf16_0: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 nv +; GFX1250-REAL16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: global_load_b32 v1, v0, s[2:3] scale_offset +; GFX1250-REAL16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-REAL16-NEXT: v_mov_b16_e32 v1.l, 0x40a0 +; GFX1250-REAL16-NEXT: global_store_b32 v0, v1, s[0:1] scale_offset +; GFX1250-REAL16-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext @@ -309,17 +322,29 @@ define amdgpu_kernel void @v_insertelement_v2bf16_0_inlineimm(ptr addrspace(1) % ; GFX942-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NEXT: s_endpgm ; -; GFX1250-LABEL: v_insertelement_v2bf16_0_inlineimm: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 nv -; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scale_offset -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_bfi_b32 v1, 0xffff, 53, v1 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scale_offset -; GFX1250-NEXT: s_endpgm +; GFX1250-FAKE16-LABEL: v_insertelement_v2bf16_0_inlineimm: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 nv +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: global_load_b32 v1, v0, s[2:3] scale_offset +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: v_bfi_b32 v1, 0xffff, 53, v1 +; GFX1250-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1] scale_offset +; GFX1250-FAKE16-NEXT: s_endpgm +; +; GFX1250-REAL16-LABEL: v_insertelement_v2bf16_0_inlineimm: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 nv +; GFX1250-REAL16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: global_load_b32 v1, v0, s[2:3] scale_offset +; GFX1250-REAL16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-REAL16-NEXT: v_mov_b16_e32 v1.l, 53 +; GFX1250-REAL16-NEXT: global_store_b32 v0, v1, s[0:1] scale_offset +; GFX1250-REAL16-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext @@ -396,19 +421,31 @@ define amdgpu_kernel void @v_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr a ; GFX942-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NEXT: s_endpgm ; -; GFX1250-LABEL: v_insertelement_v2bf16_1: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 nv -; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scale_offset -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_movk_i32 s2, 0x40a0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_perm_b32 v1, s2, v1, 0x5040100 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scale_offset -; GFX1250-NEXT: s_endpgm +; GFX1250-FAKE16-LABEL: v_insertelement_v2bf16_1: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 nv +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: global_load_b32 v1, v0, s[2:3] scale_offset +; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: s_movk_i32 s2, 0x40a0 +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: v_perm_b32 v1, s2, v1, 0x5040100 +; GFX1250-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1] scale_offset +; GFX1250-FAKE16-NEXT: s_endpgm +; +; GFX1250-REAL16-LABEL: v_insertelement_v2bf16_1: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 nv +; GFX1250-REAL16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: global_load_b32 v1, v0, s[2:3] scale_offset +; GFX1250-REAL16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-REAL16-NEXT: v_mov_b16_e32 v1.h, 0x40a0 +; GFX1250-REAL16-NEXT: global_store_b32 v0, v1, s[0:1] scale_offset +; GFX1250-REAL16-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext @@ -483,17 +520,29 @@ define amdgpu_kernel void @v_insertelement_v2bf16_1_inlineimm(ptr addrspace(1) % ; GFX942-NEXT: global_store_dword v0, v1, s[0:1] ; GFX942-NEXT: s_endpgm ; -; GFX1250-LABEL: v_insertelement_v2bf16_1_inlineimm: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 nv -; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scale_offset -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_perm_b32 v1, 35, v1, 0x5040100 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scale_offset -; GFX1250-NEXT: s_endpgm +; GFX1250-FAKE16-LABEL: v_insertelement_v2bf16_1_inlineimm: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 nv +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: global_load_b32 v1, v0, s[2:3] scale_offset +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: v_perm_b32 v1, 35, v1, 0x5040100 +; GFX1250-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1] scale_offset +; GFX1250-FAKE16-NEXT: s_endpgm +; +; GFX1250-REAL16-LABEL: v_insertelement_v2bf16_1_inlineimm: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 nv +; GFX1250-REAL16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: global_load_b32 v1, v0, s[2:3] scale_offset +; GFX1250-REAL16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-REAL16-NEXT: v_mov_b16_e32 v1.h, 35 +; GFX1250-REAL16-NEXT: global_store_b32 v0, v1, s[0:1] scale_offset +; GFX1250-REAL16-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext @@ -698,19 +747,33 @@ define amdgpu_kernel void @v_insertelement_v4bf16_0(ptr addrspace(1) %out, ptr a ; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX942-NEXT: s_endpgm ; -; GFX1250-LABEL: v_insertelement_v4bf16_0: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 nv -; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v0 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b32 s4, s[4:5], 0x30 nv -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_load_b64 v[0:1], v2, s[2:3] scale_offset -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_bfi_b32 v0, 0xffff, s4, v0 -; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset -; GFX1250-NEXT: s_endpgm +; GFX1250-FAKE16-LABEL: v_insertelement_v4bf16_0: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 nv +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x30 nv +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: global_load_b64 v[0:1], v2, s[2:3] scale_offset +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: v_bfi_b32 v0, 0xffff, s4, v0 +; GFX1250-FAKE16-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset +; GFX1250-FAKE16-NEXT: s_endpgm +; +; GFX1250-REAL16-LABEL: v_insertelement_v4bf16_0: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 nv +; GFX1250-REAL16-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1250-REAL16-NEXT: s_wait_xcnt 0x0 +; GFX1250-REAL16-NEXT: s_load_b32 s4, s[4:5], 0x30 nv +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: global_load_b64 v[0:1], v2, s[2:3] scale_offset +; GFX1250-REAL16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-REAL16-NEXT: v_mov_b16_e32 v0.l, s4 +; GFX1250-REAL16-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset +; GFX1250-REAL16-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext @@ -793,19 +856,33 @@ define amdgpu_kernel void @v_insertelement_v4bf16_1(ptr addrspace(1) %out, ptr a ; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX942-NEXT: s_endpgm ; -; GFX1250-LABEL: v_insertelement_v4bf16_1: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 nv -; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v0 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b32 s4, s[4:5], 0x10 nv -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_load_b64 v[0:1], v2, s[2:3] scale_offset -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_perm_b32 v0, s4, v0, 0x5040100 -; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset -; GFX1250-NEXT: s_endpgm +; GFX1250-FAKE16-LABEL: v_insertelement_v4bf16_1: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 nv +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x10 nv +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: global_load_b64 v[0:1], v2, s[2:3] scale_offset +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: v_perm_b32 v0, s4, v0, 0x5040100 +; GFX1250-FAKE16-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset +; GFX1250-FAKE16-NEXT: s_endpgm +; +; GFX1250-REAL16-LABEL: v_insertelement_v4bf16_1: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 nv +; GFX1250-REAL16-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1250-REAL16-NEXT: s_wait_xcnt 0x0 +; GFX1250-REAL16-NEXT: s_load_b32 s4, s[4:5], 0x10 nv +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: global_load_b64 v[0:1], v2, s[2:3] scale_offset +; GFX1250-REAL16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-REAL16-NEXT: v_mov_b16_e32 v0.h, s4 +; GFX1250-REAL16-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset +; GFX1250-REAL16-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext @@ -890,19 +967,33 @@ define amdgpu_kernel void @v_insertelement_v4bf16_2(ptr addrspace(1) %out, ptr a ; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX942-NEXT: s_endpgm ; -; GFX1250-LABEL: v_insertelement_v4bf16_2: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 nv -; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v0 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b32 s4, s[4:5], 0x30 nv -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_load_b64 v[0:1], v2, s[2:3] scale_offset -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_bfi_b32 v1, 0xffff, s4, v1 -; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset -; GFX1250-NEXT: s_endpgm +; GFX1250-FAKE16-LABEL: v_insertelement_v4bf16_2: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 nv +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x30 nv +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: global_load_b64 v[0:1], v2, s[2:3] scale_offset +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: v_bfi_b32 v1, 0xffff, s4, v1 +; GFX1250-FAKE16-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset +; GFX1250-FAKE16-NEXT: s_endpgm +; +; GFX1250-REAL16-LABEL: v_insertelement_v4bf16_2: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 nv +; GFX1250-REAL16-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1250-REAL16-NEXT: s_wait_xcnt 0x0 +; GFX1250-REAL16-NEXT: s_load_b32 s4, s[4:5], 0x30 nv +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: global_load_b64 v[0:1], v2, s[2:3] scale_offset +; GFX1250-REAL16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-REAL16-NEXT: v_mov_b16_e32 v1.l, s4 +; GFX1250-REAL16-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset +; GFX1250-REAL16-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext @@ -985,19 +1076,33 @@ define amdgpu_kernel void @v_insertelement_v4bf16_3(ptr addrspace(1) %out, ptr a ; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX942-NEXT: s_endpgm ; -; GFX1250-LABEL: v_insertelement_v4bf16_3: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 nv -; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v0 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b32 s4, s[4:5], 0x10 nv -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_load_b64 v[0:1], v2, s[2:3] scale_offset -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_perm_b32 v1, s4, v1, 0x5040100 -; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset -; GFX1250-NEXT: s_endpgm +; GFX1250-FAKE16-LABEL: v_insertelement_v4bf16_3: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 nv +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x10 nv +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: global_load_b64 v[0:1], v2, s[2:3] scale_offset +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: v_perm_b32 v1, s4, v1, 0x5040100 +; GFX1250-FAKE16-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset +; GFX1250-FAKE16-NEXT: s_endpgm +; +; GFX1250-REAL16-LABEL: v_insertelement_v4bf16_3: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 nv +; GFX1250-REAL16-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX1250-REAL16-NEXT: s_wait_xcnt 0x0 +; GFX1250-REAL16-NEXT: s_load_b32 s4, s[4:5], 0x10 nv +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: global_load_b64 v[0:1], v2, s[2:3] scale_offset +; GFX1250-REAL16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-REAL16-NEXT: v_mov_b16_e32 v1.h, s4 +; GFX1250-REAL16-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset +; GFX1250-REAL16-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext @@ -1204,19 +1309,33 @@ define amdgpu_kernel void @v_insertelement_v8bf16_3(ptr addrspace(1) %out, ptr a ; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX942-NEXT: s_endpgm ; -; GFX1250-LABEL: v_insertelement_v8bf16_3: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 nv -; GFX1250-NEXT: v_and_b32_e32 v4, 0x3ff, v0 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b32 s4, s[4:5], 0x10 nv -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_load_b128 v[0:3], v4, s[2:3] scale_offset -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_perm_b32 v1, s4, v1, 0x5040100 -; GFX1250-NEXT: global_store_b128 v4, v[0:3], s[0:1] scale_offset -; GFX1250-NEXT: s_endpgm +; GFX1250-FAKE16-LABEL: v_insertelement_v8bf16_3: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 nv +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v4, 0x3ff, v0 +; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x10 nv +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: global_load_b128 v[0:3], v4, s[2:3] scale_offset +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: v_perm_b32 v1, s4, v1, 0x5040100 +; GFX1250-FAKE16-NEXT: global_store_b128 v4, v[0:3], s[0:1] scale_offset +; GFX1250-FAKE16-NEXT: s_endpgm +; +; GFX1250-REAL16-LABEL: v_insertelement_v8bf16_3: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 nv +; GFX1250-REAL16-NEXT: v_and_b32_e32 v4, 0x3ff, v0 +; GFX1250-REAL16-NEXT: s_wait_xcnt 0x0 +; GFX1250-REAL16-NEXT: s_load_b32 s4, s[4:5], 0x10 nv +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: global_load_b128 v[0:3], v4, s[2:3] scale_offset +; GFX1250-REAL16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-REAL16-NEXT: v_mov_b16_e32 v1.h, s4 +; GFX1250-REAL16-NEXT: global_store_b128 v4, v[0:3], s[0:1] scale_offset +; GFX1250-REAL16-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <8 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext @@ -1434,50 +1553,88 @@ define amdgpu_kernel void @v_insertelement_v8bf16_dynamic(ptr addrspace(1) %out, ; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX942-NEXT: s_endpgm ; -; GFX1250-LABEL: v_insertelement_v8bf16_dynamic: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 nv -; GFX1250-NEXT: v_and_b32_e32 v4, 0x3ff, v0 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 nv -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_load_b128 v[0:3], v4, s[2:3] scale_offset -; GFX1250-NEXT: s_cmp_eq_u32 s5, 6 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 -; GFX1250-NEXT: s_cmp_eq_u32 s5, 7 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_cndmask_b32_e64 v5, v3, s4, s2 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 -; GFX1250-NEXT: s_cmp_eq_u32 s5, 4 -; GFX1250-NEXT: v_dual_lshrrev_b32 v3, 16, v3 :: v_dual_lshrrev_b32 v6, 16, v2 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 -; GFX1250-NEXT: s_cmp_eq_u32 s5, 5 -; GFX1250-NEXT: v_cndmask_b32_e64 v2, v2, s4, s3 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 -; GFX1250-NEXT: s_cmp_eq_u32 s5, 2 -; GFX1250-NEXT: v_dual_lshrrev_b32 v7, 16, v1 :: v_dual_lshrrev_b32 v8, 16, v0 -; GFX1250-NEXT: v_cndmask_b32_e64 v3, v3, s4, s2 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 -; GFX1250-NEXT: s_cmp_eq_u32 s5, 3 -; GFX1250-NEXT: v_cndmask_b32_e64 v1, v1, s4, s2 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 -; GFX1250-NEXT: s_cmp_eq_u32 s5, 0 -; GFX1250-NEXT: v_cndmask_b32_e64 v7, v7, s4, s2 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 -; GFX1250-NEXT: s_cmp_eq_u32 s5, 1 -; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s4, s2 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 -; GFX1250-NEXT: v_cndmask_b32_e64 v6, v6, s4, s3 -; GFX1250-NEXT: v_cndmask_b32_e64 v8, v8, s4, s2 -; GFX1250-NEXT: v_perm_b32 v3, v3, v5, 0x5040100 -; GFX1250-NEXT: v_perm_b32 v1, v7, v1, 0x5040100 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1250-NEXT: v_perm_b32 v2, v6, v2, 0x5040100 -; GFX1250-NEXT: v_perm_b32 v0, v8, v0, 0x5040100 -; GFX1250-NEXT: global_store_b128 v4, v[0:3], s[0:1] scale_offset -; GFX1250-NEXT: s_endpgm +; GFX1250-FAKE16-LABEL: v_insertelement_v8bf16_dynamic: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 nv +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v4, 0x3ff, v0 +; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 nv +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: global_load_b128 v[0:3], v4, s[2:3] scale_offset +; GFX1250-FAKE16-NEXT: s_cmp_eq_u32 s5, 6 +; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-FAKE16-NEXT: s_cmp_eq_u32 s5, 7 +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: v_cndmask_b32_e64 v5, v3, s4, s2 +; GFX1250-FAKE16-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-FAKE16-NEXT: s_cmp_eq_u32 s5, 4 +; GFX1250-FAKE16-NEXT: v_dual_lshrrev_b32 v3, 16, v3 :: v_dual_lshrrev_b32 v6, 16, v2 +; GFX1250-FAKE16-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1250-FAKE16-NEXT: s_cmp_eq_u32 s5, 5 +; GFX1250-FAKE16-NEXT: v_cndmask_b32_e64 v2, v2, s4, s3 +; GFX1250-FAKE16-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1250-FAKE16-NEXT: s_cmp_eq_u32 s5, 2 +; GFX1250-FAKE16-NEXT: v_dual_lshrrev_b32 v7, 16, v1 :: v_dual_lshrrev_b32 v8, 16, v0 +; GFX1250-FAKE16-NEXT: v_cndmask_b32_e64 v3, v3, s4, s2 +; GFX1250-FAKE16-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-FAKE16-NEXT: s_cmp_eq_u32 s5, 3 +; GFX1250-FAKE16-NEXT: v_cndmask_b32_e64 v1, v1, s4, s2 +; GFX1250-FAKE16-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-FAKE16-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1250-FAKE16-NEXT: v_cndmask_b32_e64 v7, v7, s4, s2 +; GFX1250-FAKE16-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-FAKE16-NEXT: s_cmp_eq_u32 s5, 1 +; GFX1250-FAKE16-NEXT: v_cndmask_b32_e64 v0, v0, s4, s2 +; GFX1250-FAKE16-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-FAKE16-NEXT: v_cndmask_b32_e64 v6, v6, s4, s3 +; GFX1250-FAKE16-NEXT: v_cndmask_b32_e64 v8, v8, s4, s2 +; GFX1250-FAKE16-NEXT: v_perm_b32 v3, v3, v5, 0x5040100 +; GFX1250-FAKE16-NEXT: v_perm_b32 v1, v7, v1, 0x5040100 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x5040100 +; GFX1250-FAKE16-NEXT: v_perm_b32 v0, v8, v0, 0x5040100 +; GFX1250-FAKE16-NEXT: global_store_b128 v4, v[0:3], s[0:1] scale_offset +; GFX1250-FAKE16-NEXT: s_endpgm +; +; GFX1250-REAL16-LABEL: v_insertelement_v8bf16_dynamic: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 nv +; GFX1250-REAL16-NEXT: v_and_b32_e32 v4, 0x3ff, v0 +; GFX1250-REAL16-NEXT: s_wait_xcnt 0x0 +; GFX1250-REAL16-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 nv +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: global_load_b128 v[0:3], v4, s[2:3] scale_offset +; GFX1250-REAL16-NEXT: s_cmp_eq_u32 s5, 6 +; GFX1250-REAL16-NEXT: s_wait_xcnt 0x0 +; GFX1250-REAL16-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-REAL16-NEXT: s_cmp_eq_u32 s5, 7 +; GFX1250-REAL16-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1250-REAL16-NEXT: s_cmp_eq_u32 s5, 4 +; GFX1250-REAL16-NEXT: s_cselect_b32 s6, -1, 0 +; GFX1250-REAL16-NEXT: s_cmp_eq_u32 s5, 5 +; GFX1250-REAL16-NEXT: s_cselect_b32 s7, -1, 0 +; GFX1250-REAL16-NEXT: s_cmp_eq_u32 s5, 2 +; GFX1250-REAL16-NEXT: s_cselect_b32 s8, -1, 0 +; GFX1250-REAL16-NEXT: s_cmp_eq_u32 s5, 3 +; GFX1250-REAL16-NEXT: s_cselect_b32 s9, -1, 0 +; GFX1250-REAL16-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1250-REAL16-NEXT: s_cselect_b32 s10, -1, 0 +; GFX1250-REAL16-NEXT: s_cmp_eq_u32 s5, 1 +; GFX1250-REAL16-NEXT: s_cselect_b32 s5, -1, 0 +; GFX1250-REAL16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-REAL16-NEXT: v_cndmask_b16 v3.l, v3.l, s4, s2 +; GFX1250-REAL16-NEXT: v_cndmask_b16 v3.h, v3.h, s4, s3 +; GFX1250-REAL16-NEXT: v_cndmask_b16 v2.l, v2.l, s4, s6 +; GFX1250-REAL16-NEXT: v_cndmask_b16 v2.h, v2.h, s4, s7 +; GFX1250-REAL16-NEXT: v_cndmask_b16 v1.l, v1.l, s4, s8 +; GFX1250-REAL16-NEXT: v_cndmask_b16 v1.h, v1.h, s4, s9 +; GFX1250-REAL16-NEXT: v_cndmask_b16 v0.l, v0.l, s4, s10 +; GFX1250-REAL16-NEXT: v_cndmask_b16 v0.h, v0.h, s4, s5 +; GFX1250-REAL16-NEXT: global_store_b128 v4, v[0:3], s[0:1] scale_offset +; GFX1250-REAL16-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <8 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext @@ -1577,26 +1734,46 @@ define amdgpu_kernel void @v_insertelement_v16bf16_3(ptr addrspace(1) %out, ptr ; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX942-NEXT: s_endpgm ; -; GFX1250-LABEL: v_insertelement_v16bf16_3: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 nv -; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b32 s4, s[4:5], 0x10 nv -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_lshlrev_b32_e32 v8, 5, v0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_clause 0x1 -; GFX1250-NEXT: global_load_b128 v[0:3], v8, s[2:3] -; GFX1250-NEXT: global_load_b128 v[4:7], v8, s[2:3] offset:16 -; GFX1250-NEXT: s_wait_loadcnt 0x1 -; GFX1250-NEXT: v_perm_b32 v1, s4, v1, 0x5040100 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_clause 0x1 -; GFX1250-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16 -; GFX1250-NEXT: global_store_b128 v8, v[0:3], s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-FAKE16-LABEL: v_insertelement_v16bf16_3: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 nv +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x10 nv +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 5, v0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: s_clause 0x1 +; GFX1250-FAKE16-NEXT: global_load_b128 v[0:3], v8, s[2:3] +; GFX1250-FAKE16-NEXT: global_load_b128 v[4:7], v8, s[2:3] offset:16 +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x1 +; GFX1250-FAKE16-NEXT: v_perm_b32 v1, s4, v1, 0x5040100 +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: s_clause 0x1 +; GFX1250-FAKE16-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16 +; GFX1250-FAKE16-NEXT: global_store_b128 v8, v[0:3], s[0:1] +; GFX1250-FAKE16-NEXT: s_endpgm +; +; GFX1250-REAL16-LABEL: v_insertelement_v16bf16_3: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 nv +; GFX1250-REAL16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-REAL16-NEXT: s_wait_xcnt 0x0 +; GFX1250-REAL16-NEXT: s_load_b32 s4, s[4:5], 0x10 nv +; GFX1250-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-REAL16-NEXT: v_lshlrev_b32_e32 v8, 5, v0 +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: s_clause 0x1 +; GFX1250-REAL16-NEXT: global_load_b128 v[0:3], v8, s[2:3] offset:16 +; GFX1250-REAL16-NEXT: global_load_b128 v[4:7], v8, s[2:3] +; GFX1250-REAL16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-REAL16-NEXT: v_mov_b16_e32 v5.h, s4 +; GFX1250-REAL16-NEXT: s_clause 0x1 +; GFX1250-REAL16-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16 +; GFX1250-REAL16-NEXT: global_store_b128 v8, v[4:7], s[0:1] +; GFX1250-REAL16-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <16 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext @@ -1971,87 +2148,156 @@ define amdgpu_kernel void @v_insertelement_v16bf16_dynamic(ptr addrspace(1) %out ; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[36:37] ; GFX942-NEXT: s_endpgm ; -; GFX1250-LABEL: v_insertelement_v16bf16_dynamic: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 nv -; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 nv -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_lshlrev_b32_e32 v8, 5, v0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_clause 0x1 -; GFX1250-NEXT: global_load_b128 v[0:3], v8, s[2:3] -; GFX1250-NEXT: global_load_b128 v[4:7], v8, s[2:3] offset:16 -; GFX1250-NEXT: s_cmp_eq_u32 s5, 6 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 -; GFX1250-NEXT: s_cmp_eq_u32 s5, 7 -; GFX1250-NEXT: s_wait_loadcnt 0x1 -; GFX1250-NEXT: v_cndmask_b32_e64 v9, v3, s4, s2 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 -; GFX1250-NEXT: s_cmp_eq_u32 s5, 4 -; GFX1250-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 -; GFX1250-NEXT: s_cmp_eq_u32 s5, 5 -; GFX1250-NEXT: v_dual_lshrrev_b32 v10, 16, v2 :: v_dual_lshrrev_b32 v11, 16, v1 -; GFX1250-NEXT: v_cndmask_b32_e64 v2, v2, s4, s3 -; GFX1250-NEXT: s_cselect_b32 s3, -1, 0 -; GFX1250-NEXT: s_cmp_eq_u32 s5, 2 -; GFX1250-NEXT: v_cndmask_b32_e64 v3, v3, s4, s2 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 -; GFX1250-NEXT: s_cmp_eq_u32 s5, 3 -; GFX1250-NEXT: v_cndmask_b32_e64 v1, v1, s4, s2 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 -; GFX1250-NEXT: s_cmp_eq_u32 s5, 0 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_dual_lshrrev_b32 v12, 16, v0 :: v_dual_lshrrev_b32 v13, 16, v7 -; GFX1250-NEXT: v_cndmask_b32_e64 v11, v11, s4, s2 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 -; GFX1250-NEXT: s_cmp_eq_u32 s5, 1 -; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s4, s2 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 -; GFX1250-NEXT: s_cmp_eq_u32 s5, 14 -; GFX1250-NEXT: v_cndmask_b32_e64 v10, v10, s4, s3 -; GFX1250-NEXT: v_perm_b32 v3, v3, v9, 0x5040100 -; GFX1250-NEXT: v_cndmask_b32_e64 v9, v12, s4, s2 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 -; GFX1250-NEXT: s_cmp_eq_u32 s5, 15 -; GFX1250-NEXT: v_cndmask_b32_e64 v7, v7, s4, s2 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 -; GFX1250-NEXT: s_cmp_eq_u32 s5, 12 -; GFX1250-NEXT: v_dual_lshrrev_b32 v14, 16, v6 :: v_dual_lshrrev_b32 v15, 16, v5 -; GFX1250-NEXT: v_perm_b32 v2, v10, v2, 0x5040100 -; GFX1250-NEXT: v_cndmask_b32_e64 v10, v13, s4, s2 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 -; GFX1250-NEXT: s_cmp_eq_u32 s5, 13 -; GFX1250-NEXT: v_cndmask_b32_e64 v6, v6, s4, s2 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 -; GFX1250-NEXT: s_cmp_eq_u32 s5, 10 -; GFX1250-NEXT: v_cndmask_b32_e64 v12, v14, s4, s2 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 -; GFX1250-NEXT: s_cmp_eq_u32 s5, 11 -; GFX1250-NEXT: v_cndmask_b32_e64 v5, v5, s4, s2 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 -; GFX1250-NEXT: s_cmp_eq_u32 s5, 8 -; GFX1250-NEXT: v_lshrrev_b32_e32 v16, 16, v4 -; GFX1250-NEXT: v_cndmask_b32_e64 v13, v15, s4, s2 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 -; GFX1250-NEXT: s_cmp_eq_u32 s5, 9 -; GFX1250-NEXT: v_cndmask_b32_e64 v4, v4, s4, s2 -; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 -; GFX1250-NEXT: v_perm_b32 v7, v10, v7, 0x5040100 -; GFX1250-NEXT: v_cndmask_b32_e64 v14, v16, s4, s2 -; GFX1250-NEXT: v_perm_b32 v6, v12, v6, 0x5040100 -; GFX1250-NEXT: v_perm_b32 v5, v13, v5, 0x5040100 -; GFX1250-NEXT: v_perm_b32 v1, v11, v1, 0x5040100 -; GFX1250-NEXT: v_perm_b32 v0, v9, v0, 0x5040100 -; GFX1250-NEXT: v_perm_b32 v4, v14, v4, 0x5040100 -; GFX1250-NEXT: s_clause 0x1 -; GFX1250-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16 -; GFX1250-NEXT: global_store_b128 v8, v[0:3], s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-FAKE16-LABEL: v_insertelement_v16bf16_dynamic: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 nv +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 nv +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 5, v0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: s_clause 0x1 +; GFX1250-FAKE16-NEXT: global_load_b128 v[0:3], v8, s[2:3] +; GFX1250-FAKE16-NEXT: global_load_b128 v[4:7], v8, s[2:3] offset:16 +; GFX1250-FAKE16-NEXT: s_cmp_eq_u32 s5, 6 +; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-FAKE16-NEXT: s_cmp_eq_u32 s5, 7 +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x1 +; GFX1250-FAKE16-NEXT: v_cndmask_b32_e64 v9, v3, s4, s2 +; GFX1250-FAKE16-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-FAKE16-NEXT: s_cmp_eq_u32 s5, 4 +; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX1250-FAKE16-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1250-FAKE16-NEXT: s_cmp_eq_u32 s5, 5 +; GFX1250-FAKE16-NEXT: v_dual_lshrrev_b32 v10, 16, v2 :: v_dual_lshrrev_b32 v11, 16, v1 +; GFX1250-FAKE16-NEXT: v_cndmask_b32_e64 v2, v2, s4, s3 +; GFX1250-FAKE16-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1250-FAKE16-NEXT: s_cmp_eq_u32 s5, 2 +; GFX1250-FAKE16-NEXT: v_cndmask_b32_e64 v3, v3, s4, s2 +; GFX1250-FAKE16-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-FAKE16-NEXT: s_cmp_eq_u32 s5, 3 +; GFX1250-FAKE16-NEXT: v_cndmask_b32_e64 v1, v1, s4, s2 +; GFX1250-FAKE16-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-FAKE16-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: v_dual_lshrrev_b32 v12, 16, v0 :: v_dual_lshrrev_b32 v13, 16, v7 +; GFX1250-FAKE16-NEXT: v_cndmask_b32_e64 v11, v11, s4, s2 +; GFX1250-FAKE16-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-FAKE16-NEXT: s_cmp_eq_u32 s5, 1 +; GFX1250-FAKE16-NEXT: v_cndmask_b32_e64 v0, v0, s4, s2 +; GFX1250-FAKE16-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-FAKE16-NEXT: s_cmp_eq_u32 s5, 14 +; GFX1250-FAKE16-NEXT: v_cndmask_b32_e64 v10, v10, s4, s3 +; GFX1250-FAKE16-NEXT: v_perm_b32 v3, v3, v9, 0x5040100 +; GFX1250-FAKE16-NEXT: v_cndmask_b32_e64 v9, v12, s4, s2 +; GFX1250-FAKE16-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-FAKE16-NEXT: s_cmp_eq_u32 s5, 15 +; GFX1250-FAKE16-NEXT: v_cndmask_b32_e64 v7, v7, s4, s2 +; GFX1250-FAKE16-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-FAKE16-NEXT: s_cmp_eq_u32 s5, 12 +; GFX1250-FAKE16-NEXT: v_dual_lshrrev_b32 v14, 16, v6 :: v_dual_lshrrev_b32 v15, 16, v5 +; GFX1250-FAKE16-NEXT: v_perm_b32 v2, v10, v2, 0x5040100 +; GFX1250-FAKE16-NEXT: v_cndmask_b32_e64 v10, v13, s4, s2 +; GFX1250-FAKE16-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-FAKE16-NEXT: s_cmp_eq_u32 s5, 13 +; GFX1250-FAKE16-NEXT: v_cndmask_b32_e64 v6, v6, s4, s2 +; GFX1250-FAKE16-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-FAKE16-NEXT: s_cmp_eq_u32 s5, 10 +; GFX1250-FAKE16-NEXT: v_cndmask_b32_e64 v12, v14, s4, s2 +; GFX1250-FAKE16-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-FAKE16-NEXT: s_cmp_eq_u32 s5, 11 +; GFX1250-FAKE16-NEXT: v_cndmask_b32_e64 v5, v5, s4, s2 +; GFX1250-FAKE16-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-FAKE16-NEXT: s_cmp_eq_u32 s5, 8 +; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v4 +; GFX1250-FAKE16-NEXT: v_cndmask_b32_e64 v13, v15, s4, s2 +; GFX1250-FAKE16-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-FAKE16-NEXT: s_cmp_eq_u32 s5, 9 +; GFX1250-FAKE16-NEXT: v_cndmask_b32_e64 v4, v4, s4, s2 +; GFX1250-FAKE16-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-FAKE16-NEXT: v_perm_b32 v7, v10, v7, 0x5040100 +; GFX1250-FAKE16-NEXT: v_cndmask_b32_e64 v14, v16, s4, s2 +; GFX1250-FAKE16-NEXT: v_perm_b32 v6, v12, v6, 0x5040100 +; GFX1250-FAKE16-NEXT: v_perm_b32 v5, v13, v5, 0x5040100 +; GFX1250-FAKE16-NEXT: v_perm_b32 v1, v11, v1, 0x5040100 +; GFX1250-FAKE16-NEXT: v_perm_b32 v0, v9, v0, 0x5040100 +; GFX1250-FAKE16-NEXT: v_perm_b32 v4, v14, v4, 0x5040100 +; GFX1250-FAKE16-NEXT: s_clause 0x1 +; GFX1250-FAKE16-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16 +; GFX1250-FAKE16-NEXT: global_store_b128 v8, v[0:3], s[0:1] +; GFX1250-FAKE16-NEXT: s_endpgm +; +; GFX1250-REAL16-LABEL: v_insertelement_v16bf16_dynamic: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 nv +; GFX1250-REAL16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-REAL16-NEXT: s_wait_xcnt 0x0 +; GFX1250-REAL16-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 nv +; GFX1250-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-REAL16-NEXT: v_lshlrev_b32_e32 v8, 5, v0 +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: s_clause 0x1 +; GFX1250-REAL16-NEXT: global_load_b128 v[0:3], v8, s[2:3] +; GFX1250-REAL16-NEXT: global_load_b128 v[4:7], v8, s[2:3] offset:16 +; GFX1250-REAL16-NEXT: s_cmp_eq_u32 s5, 6 +; GFX1250-REAL16-NEXT: s_wait_xcnt 0x0 +; GFX1250-REAL16-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-REAL16-NEXT: s_cmp_eq_u32 s5, 7 +; GFX1250-REAL16-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1250-REAL16-NEXT: s_cmp_eq_u32 s5, 4 +; GFX1250-REAL16-NEXT: s_cselect_b32 s6, -1, 0 +; GFX1250-REAL16-NEXT: s_cmp_eq_u32 s5, 5 +; GFX1250-REAL16-NEXT: s_cselect_b32 s7, -1, 0 +; GFX1250-REAL16-NEXT: s_cmp_eq_u32 s5, 2 +; GFX1250-REAL16-NEXT: s_cselect_b32 s8, -1, 0 +; GFX1250-REAL16-NEXT: s_cmp_eq_u32 s5, 3 +; GFX1250-REAL16-NEXT: s_cselect_b32 s9, -1, 0 +; GFX1250-REAL16-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1250-REAL16-NEXT: s_cselect_b32 s10, -1, 0 +; GFX1250-REAL16-NEXT: s_cmp_eq_u32 s5, 1 +; GFX1250-REAL16-NEXT: s_cselect_b32 s11, -1, 0 +; GFX1250-REAL16-NEXT: s_cmp_eq_u32 s5, 14 +; GFX1250-REAL16-NEXT: s_cselect_b32 s12, -1, 0 +; GFX1250-REAL16-NEXT: s_cmp_eq_u32 s5, 15 +; GFX1250-REAL16-NEXT: s_cselect_b32 s13, -1, 0 +; GFX1250-REAL16-NEXT: s_cmp_eq_u32 s5, 12 +; GFX1250-REAL16-NEXT: s_cselect_b32 s14, -1, 0 +; GFX1250-REAL16-NEXT: s_cmp_eq_u32 s5, 13 +; GFX1250-REAL16-NEXT: s_cselect_b32 s15, -1, 0 +; GFX1250-REAL16-NEXT: s_cmp_eq_u32 s5, 10 +; GFX1250-REAL16-NEXT: s_cselect_b32 s16, -1, 0 +; GFX1250-REAL16-NEXT: s_cmp_eq_u32 s5, 11 +; GFX1250-REAL16-NEXT: s_cselect_b32 s17, -1, 0 +; GFX1250-REAL16-NEXT: s_cmp_eq_u32 s5, 8 +; GFX1250-REAL16-NEXT: s_cselect_b32 s18, -1, 0 +; GFX1250-REAL16-NEXT: s_cmp_eq_u32 s5, 9 +; GFX1250-REAL16-NEXT: s_cselect_b32 s5, -1, 0 +; GFX1250-REAL16-NEXT: s_wait_loadcnt 0x1 +; GFX1250-REAL16-NEXT: v_cndmask_b16 v3.l, v3.l, s4, s2 +; GFX1250-REAL16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-REAL16-NEXT: v_cndmask_b16 v7.l, v7.l, s4, s12 +; GFX1250-REAL16-NEXT: v_cndmask_b16 v7.h, v7.h, s4, s13 +; GFX1250-REAL16-NEXT: v_cndmask_b16 v6.l, v6.l, s4, s14 +; GFX1250-REAL16-NEXT: v_cndmask_b16 v6.h, v6.h, s4, s15 +; GFX1250-REAL16-NEXT: v_cndmask_b16 v5.l, v5.l, s4, s16 +; GFX1250-REAL16-NEXT: v_cndmask_b16 v5.h, v5.h, s4, s17 +; GFX1250-REAL16-NEXT: v_cndmask_b16 v4.l, v4.l, s4, s18 +; GFX1250-REAL16-NEXT: v_cndmask_b16 v4.h, v4.h, s4, s5 +; GFX1250-REAL16-NEXT: v_cndmask_b16 v3.h, v3.h, s4, s3 +; GFX1250-REAL16-NEXT: v_cndmask_b16 v2.l, v2.l, s4, s6 +; GFX1250-REAL16-NEXT: v_cndmask_b16 v2.h, v2.h, s4, s7 +; GFX1250-REAL16-NEXT: v_cndmask_b16 v1.l, v1.l, s4, s8 +; GFX1250-REAL16-NEXT: v_cndmask_b16 v1.h, v1.h, s4, s9 +; GFX1250-REAL16-NEXT: v_cndmask_b16 v0.l, v0.l, s4, s10 +; GFX1250-REAL16-NEXT: v_cndmask_b16 v0.h, v0.h, s4, s11 +; GFX1250-REAL16-NEXT: s_clause 0x1 +; GFX1250-REAL16-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16 +; GFX1250-REAL16-NEXT: global_store_b128 v8, v[0:3], s[0:1] +; GFX1250-REAL16-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <16 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext diff --git a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll index 10a24a1b3b5e1..815dcd268a0df 100644 --- a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll +++ b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll @@ -27,8 +27,10 @@ ; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX1200,GFX1200-GISEL,GFX1200-GISEL-TRUE16 %s ; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX1200,GFX1200-GISEL,GFX1200-GISEL-FAKE16 %s -; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s -; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG,GFX1250-FAKE16,GFX1250-SDAG-FAKE16 %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-FAKE16,GFX1250-GISEL-FAKE16 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG,GFX1250-REAL16,GFX1250-SDAG-REAL16 %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-REAL16,GFX1250-GISEL-REAL16 %s ; Test for integer mad formation for patterns used in clpeak @@ -240,6 +242,39 @@ define i32 @clpeak_imad_pat_i32(i32 %x, i32 %y) { ; GFX1250-SDAG-NEXT: v_mad_u32 v0, v1, v0, v1 ; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] ; +; GFX1250-GISEL-FAKE16-LABEL: clpeak_imad_pat_i32: +; GFX1250-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX1250-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: v_add_nc_u32_e32 v0, 1, v0 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v2, v0, v1 +; GFX1250-GISEL-FAKE16-NEXT: v_add_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v0, v0, v1 +; GFX1250-GISEL-FAKE16-NEXT: v_add_nc_u32_e32 v1, 1, v2 +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v1, v0, v1 +; GFX1250-GISEL-FAKE16-NEXT: v_add_nc_u32_e32 v0, 1, v0 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v0, v1, v0 +; GFX1250-GISEL-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-REAL16-LABEL: clpeak_imad_pat_i32: +; GFX1250-GISEL-REAL16: ; %bb.0: ; %entry +; GFX1250-GISEL-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-REAL16-NEXT: v_add_nc_u32_e32 v0, 1, v0 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v2, v0, v1 +; GFX1250-GISEL-REAL16-NEXT: v_add_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v0, v0, v1 +; GFX1250-GISEL-REAL16-NEXT: v_add_nc_u32_e32 v1, 1, v2 +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v1, v0, v1 +; GFX1250-GISEL-REAL16-NEXT: v_add_nc_u32_e32 v0, 1, v0 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v0, v1, v0 +; GFX1250-GISEL-REAL16-NEXT: s_set_pc_i64 s[30:31] ; GFX1250-GISEL-LABEL: clpeak_imad_pat_i32: ; GFX1250-GISEL: ; %bb.0: ; %entry ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -495,19 +530,67 @@ define signext i16 @clpeak_imad_pat_i16(i16 signext %x, i16 signext %y) { ; GFX1200-GISEL-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX1200-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX1250-SDAG-LABEL: clpeak_imad_pat_i16: -; GFX1250-SDAG: ; %bb.0: ; %entry -; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1250-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_mad_u16 v1, v0, v1, v0 -; GFX1250-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_mad_u16 v0, v0, v1, v0 -; GFX1250-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] -; +; GFX1250-SDAG-FAKE16-LABEL: clpeak_imad_pat_i16: +; GFX1250-SDAG-FAKE16: ; %bb.0: ; %entry +; GFX1250-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1 +; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-FAKE16-NEXT: v_mad_u16 v1, v0, v1, v0 +; GFX1250-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1 +; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v0 +; GFX1250-SDAG-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX1250-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-FAKE16-LABEL: clpeak_imad_pat_i16: +; GFX1250-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX1250-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: v_add_nc_u16 v0, v0, 1 +; GFX1250-GISEL-FAKE16-NEXT: v_add_nc_u16 v2, v1, 1 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX1250-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v2, v3 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX1250-GISEL-FAKE16-NEXT: v_mad_u16 v1, v2, v3, 1 +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX1250-GISEL-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-SDAG-REAL16-LABEL: clpeak_imad_pat_i16: +; GFX1250-SDAG-REAL16: ; %bb.0: ; %entry +; GFX1250-SDAG-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-REAL16-NEXT: v_mad_u16 v0.l, v1.l, v0.l, v1.l +; GFX1250-SDAG-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-REAL16-NEXT: v_mad_u16 v0.h, v0.l, v1.l, v0.l +; GFX1250-SDAG-REAL16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h +; GFX1250-SDAG-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-REAL16-NEXT: v_mad_u16 v0.l, v0.l, v0.h, v0.l +; GFX1250-SDAG-REAL16-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX1250-SDAG-REAL16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-REAL16-LABEL: clpeak_imad_pat_i16: +; GFX1250-GISEL-REAL16: ; %bb.0: ; %entry +; GFX1250-GISEL-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-REAL16-NEXT: v_add_nc_u16 v0.l, v0.l, 1 +; GFX1250-GISEL-REAL16-NEXT: v_add_nc_u16 v0.h, v1.l, 1 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u16 v1.h, v0.l, v1.l +; GFX1250-GISEL-REAL16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, 1 +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u16 v1.l, v0.h, v1.h +; GFX1250-GISEL-REAL16-NEXT: v_mad_u16 v0.h, v0.h, v1.h, 1 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u16 v0.l, v0.l, v0.h +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-REAL16-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX1250-GISEL-REAL16-NEXT: s_set_pc_i64 s[30:31] ; GFX1250-GISEL-LABEL: clpeak_imad_pat_i16: ; GFX1250-GISEL: ; %bb.0: ; %entry ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -1123,6 +1206,53 @@ define <3 x i16> @clpeak_imad_pat_v3i16(<3 x i16> %x, <3 x i16> %y) { ; GFX1250-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 ; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] ; +; GFX1250-GISEL-FAKE16-LABEL: clpeak_imad_pat_v3i16: +; GFX1250-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX1250-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] +; GFX1250-GISEL-FAKE16-NEXT: v_pk_add_u16 v1, v1, 1 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-GISEL-FAKE16-NEXT: v_pk_mad_u16 v4, v0, v2, v0 +; GFX1250-GISEL-FAKE16-NEXT: v_pk_mad_u16 v5, v1, v3, v1 +; GFX1250-GISEL-FAKE16-NEXT: v_pk_mad_u16 v0, v0, v2, 1 op_sel_hi:[1,1,0] +; GFX1250-GISEL-FAKE16-NEXT: v_pk_mad_u16 v1, v1, v3, 1 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-GISEL-FAKE16-NEXT: v_pk_mul_lo_u16 v6, v4, v2 +; GFX1250-GISEL-FAKE16-NEXT: v_pk_mul_lo_u16 v7, v5, v3 +; GFX1250-GISEL-FAKE16-NEXT: v_pk_mad_u16 v2, v4, v2, 1 op_sel_hi:[1,1,0] +; GFX1250-GISEL-FAKE16-NEXT: v_pk_mad_u16 v3, v5, v3, 1 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-GISEL-FAKE16-NEXT: v_pk_mul_lo_u16 v0, v6, v0 +; GFX1250-GISEL-FAKE16-NEXT: v_pk_mul_lo_u16 v1, v7, v1 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-GISEL-FAKE16-NEXT: v_pk_mul_lo_u16 v0, v0, v2 +; GFX1250-GISEL-FAKE16-NEXT: v_pk_mul_lo_u16 v1, v1, v3 +; GFX1250-GISEL-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-REAL16-LABEL: clpeak_imad_pat_v3i16: +; GFX1250-GISEL-REAL16: ; %bb.0: ; %entry +; GFX1250-GISEL-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-REAL16-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] +; GFX1250-GISEL-REAL16-NEXT: v_pk_add_u16 v1, v1, 1 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-GISEL-REAL16-NEXT: v_pk_mad_u16 v4, v0, v2, v0 +; GFX1250-GISEL-REAL16-NEXT: v_pk_mad_u16 v5, v1, v3, v1 +; GFX1250-GISEL-REAL16-NEXT: v_pk_mad_u16 v0, v0, v2, 1 op_sel_hi:[1,1,0] +; GFX1250-GISEL-REAL16-NEXT: v_pk_mad_u16 v1, v1, v3, 1 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-GISEL-REAL16-NEXT: v_pk_mul_lo_u16 v6, v4, v2 +; GFX1250-GISEL-REAL16-NEXT: v_pk_mul_lo_u16 v7, v5, v3 +; GFX1250-GISEL-REAL16-NEXT: v_pk_mad_u16 v2, v4, v2, 1 op_sel_hi:[1,1,0] +; GFX1250-GISEL-REAL16-NEXT: v_pk_mad_u16 v3, v5, v3, 1 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-GISEL-REAL16-NEXT: v_pk_mul_lo_u16 v0, v6, v0 +; GFX1250-GISEL-REAL16-NEXT: v_pk_mul_lo_u16 v1, v7, v1 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-GISEL-REAL16-NEXT: v_pk_mul_lo_u16 v0, v0, v2 +; GFX1250-GISEL-REAL16-NEXT: v_pk_mul_lo_u16 v1, v1, v3 +; GFX1250-GISEL-REAL16-NEXT: s_set_pc_i64 s[30:31] ; GFX1250-GISEL-LABEL: clpeak_imad_pat_v3i16: ; GFX1250-GISEL: ; %bb.0: ; %entry ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -1620,6 +1750,53 @@ define <4 x i16> @clpeak_imad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) { ; GFX1250-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 ; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] ; +; GFX1250-GISEL-FAKE16-LABEL: clpeak_imad_pat_v4i16: +; GFX1250-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX1250-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] +; GFX1250-GISEL-FAKE16-NEXT: v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0] +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-GISEL-FAKE16-NEXT: v_pk_mad_u16 v4, v0, v2, v0 +; GFX1250-GISEL-FAKE16-NEXT: v_pk_mad_u16 v5, v1, v3, v1 +; GFX1250-GISEL-FAKE16-NEXT: v_pk_mad_u16 v0, v0, v2, 1 op_sel_hi:[1,1,0] +; GFX1250-GISEL-FAKE16-NEXT: v_pk_mad_u16 v1, v1, v3, 1 op_sel_hi:[1,1,0] +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-GISEL-FAKE16-NEXT: v_pk_mul_lo_u16 v6, v4, v2 +; GFX1250-GISEL-FAKE16-NEXT: v_pk_mul_lo_u16 v7, v5, v3 +; GFX1250-GISEL-FAKE16-NEXT: v_pk_mad_u16 v2, v4, v2, 1 op_sel_hi:[1,1,0] +; GFX1250-GISEL-FAKE16-NEXT: v_pk_mad_u16 v3, v5, v3, 1 op_sel_hi:[1,1,0] +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-GISEL-FAKE16-NEXT: v_pk_mul_lo_u16 v0, v6, v0 +; GFX1250-GISEL-FAKE16-NEXT: v_pk_mul_lo_u16 v1, v7, v1 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-GISEL-FAKE16-NEXT: v_pk_mul_lo_u16 v0, v0, v2 +; GFX1250-GISEL-FAKE16-NEXT: v_pk_mul_lo_u16 v1, v1, v3 +; GFX1250-GISEL-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-REAL16-LABEL: clpeak_imad_pat_v4i16: +; GFX1250-GISEL-REAL16: ; %bb.0: ; %entry +; GFX1250-GISEL-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-REAL16-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] +; GFX1250-GISEL-REAL16-NEXT: v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0] +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-GISEL-REAL16-NEXT: v_pk_mad_u16 v4, v0, v2, v0 +; GFX1250-GISEL-REAL16-NEXT: v_pk_mad_u16 v5, v1, v3, v1 +; GFX1250-GISEL-REAL16-NEXT: v_pk_mad_u16 v0, v0, v2, 1 op_sel_hi:[1,1,0] +; GFX1250-GISEL-REAL16-NEXT: v_pk_mad_u16 v1, v1, v3, 1 op_sel_hi:[1,1,0] +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-GISEL-REAL16-NEXT: v_pk_mul_lo_u16 v6, v4, v2 +; GFX1250-GISEL-REAL16-NEXT: v_pk_mul_lo_u16 v7, v5, v3 +; GFX1250-GISEL-REAL16-NEXT: v_pk_mad_u16 v2, v4, v2, 1 op_sel_hi:[1,1,0] +; GFX1250-GISEL-REAL16-NEXT: v_pk_mad_u16 v3, v5, v3, 1 op_sel_hi:[1,1,0] +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-GISEL-REAL16-NEXT: v_pk_mul_lo_u16 v0, v6, v0 +; GFX1250-GISEL-REAL16-NEXT: v_pk_mul_lo_u16 v1, v7, v1 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-GISEL-REAL16-NEXT: v_pk_mul_lo_u16 v0, v0, v2 +; GFX1250-GISEL-REAL16-NEXT: v_pk_mul_lo_u16 v1, v1, v3 +; GFX1250-GISEL-REAL16-NEXT: s_set_pc_i64 s[30:31] ; GFX1250-GISEL-LABEL: clpeak_imad_pat_v4i16: ; GFX1250-GISEL: ; %bb.0: ; %entry ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -1875,19 +2052,66 @@ define zeroext i16 @clpeak_umad_pat_i16(i16 zeroext %x, i16 zeroext %y) { ; GFX1200-GISEL-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX1200-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX1250-SDAG-LABEL: clpeak_umad_pat_i16: -; GFX1250-SDAG: ; %bb.0: ; %entry -; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1250-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_mad_u16 v1, v0, v1, v0 -; GFX1250-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_mad_u16 v0, v0, v1, v0 -; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] -; +; GFX1250-SDAG-FAKE16-LABEL: clpeak_umad_pat_i16: +; GFX1250-SDAG-FAKE16: ; %bb.0: ; %entry +; GFX1250-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1 +; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-FAKE16-NEXT: v_mad_u16 v1, v0, v1, v0 +; GFX1250-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1 +; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v0 +; GFX1250-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1250-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-FAKE16-LABEL: clpeak_umad_pat_i16: +; GFX1250-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX1250-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: v_add_nc_u16 v0, v0, 1 +; GFX1250-GISEL-FAKE16-NEXT: v_add_nc_u16 v2, v1, 1 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX1250-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v2, v3 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX1250-GISEL-FAKE16-NEXT: v_mad_u16 v1, v2, v3, 1 +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1250-GISEL-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-SDAG-REAL16-LABEL: clpeak_umad_pat_i16: +; GFX1250-SDAG-REAL16: ; %bb.0: ; %entry +; GFX1250-SDAG-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-REAL16-NEXT: v_mad_u16 v0.l, v1.l, v0.l, v1.l +; GFX1250-SDAG-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-REAL16-NEXT: v_mad_u16 v0.h, v0.l, v1.l, v0.l +; GFX1250-SDAG-REAL16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h +; GFX1250-SDAG-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-REAL16-NEXT: v_mad_u16 v0.l, v0.l, v0.h, v0.l +; GFX1250-SDAG-REAL16-NEXT: v_mov_b16_e32 v0.h, 0 +; GFX1250-SDAG-REAL16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-REAL16-LABEL: clpeak_umad_pat_i16: +; GFX1250-GISEL-REAL16: ; %bb.0: ; %entry +; GFX1250-GISEL-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-REAL16-NEXT: v_add_nc_u16 v0.l, v0.l, 1 +; GFX1250-GISEL-REAL16-NEXT: v_add_nc_u16 v0.h, v1.l, 1 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u16 v1.h, v0.l, v1.l +; GFX1250-GISEL-REAL16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, 1 +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u16 v1.l, v0.h, v1.h +; GFX1250-GISEL-REAL16-NEXT: v_mad_u16 v0.h, v0.h, v1.h, 1 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u16 v0.l, v0.l, v0.h +; GFX1250-GISEL-REAL16-NEXT: v_mov_b16_e32 v0.h, 0 +; GFX1250-GISEL-REAL16-NEXT: s_set_pc_i64 s[30:31] ; GFX1250-GISEL-LABEL: clpeak_umad_pat_i16: ; GFX1250-GISEL: ; %bb.0: ; %entry ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -2503,6 +2727,53 @@ define <3 x i16> @clpeak_umad_pat_v3i16(<3 x i16> %x, <3 x i16> %y) { ; GFX1250-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 ; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] ; +; GFX1250-GISEL-FAKE16-LABEL: clpeak_umad_pat_v3i16: +; GFX1250-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX1250-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] +; GFX1250-GISEL-FAKE16-NEXT: v_pk_add_u16 v1, v1, 1 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-GISEL-FAKE16-NEXT: v_pk_mad_u16 v4, v0, v2, v0 +; GFX1250-GISEL-FAKE16-NEXT: v_pk_mad_u16 v5, v1, v3, v1 +; GFX1250-GISEL-FAKE16-NEXT: v_pk_mad_u16 v0, v0, v2, 1 op_sel_hi:[1,1,0] +; GFX1250-GISEL-FAKE16-NEXT: v_pk_mad_u16 v1, v1, v3, 1 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-GISEL-FAKE16-NEXT: v_pk_mul_lo_u16 v6, v4, v2 +; GFX1250-GISEL-FAKE16-NEXT: v_pk_mul_lo_u16 v7, v5, v3 +; GFX1250-GISEL-FAKE16-NEXT: v_pk_mad_u16 v2, v4, v2, 1 op_sel_hi:[1,1,0] +; GFX1250-GISEL-FAKE16-NEXT: v_pk_mad_u16 v3, v5, v3, 1 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-GISEL-FAKE16-NEXT: v_pk_mul_lo_u16 v0, v6, v0 +; GFX1250-GISEL-FAKE16-NEXT: v_pk_mul_lo_u16 v1, v7, v1 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-GISEL-FAKE16-NEXT: v_pk_mul_lo_u16 v0, v0, v2 +; GFX1250-GISEL-FAKE16-NEXT: v_pk_mul_lo_u16 v1, v1, v3 +; GFX1250-GISEL-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-REAL16-LABEL: clpeak_umad_pat_v3i16: +; GFX1250-GISEL-REAL16: ; %bb.0: ; %entry +; GFX1250-GISEL-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-REAL16-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] +; GFX1250-GISEL-REAL16-NEXT: v_pk_add_u16 v1, v1, 1 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-GISEL-REAL16-NEXT: v_pk_mad_u16 v4, v0, v2, v0 +; GFX1250-GISEL-REAL16-NEXT: v_pk_mad_u16 v5, v1, v3, v1 +; GFX1250-GISEL-REAL16-NEXT: v_pk_mad_u16 v0, v0, v2, 1 op_sel_hi:[1,1,0] +; GFX1250-GISEL-REAL16-NEXT: v_pk_mad_u16 v1, v1, v3, 1 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-GISEL-REAL16-NEXT: v_pk_mul_lo_u16 v6, v4, v2 +; GFX1250-GISEL-REAL16-NEXT: v_pk_mul_lo_u16 v7, v5, v3 +; GFX1250-GISEL-REAL16-NEXT: v_pk_mad_u16 v2, v4, v2, 1 op_sel_hi:[1,1,0] +; GFX1250-GISEL-REAL16-NEXT: v_pk_mad_u16 v3, v5, v3, 1 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-GISEL-REAL16-NEXT: v_pk_mul_lo_u16 v0, v6, v0 +; GFX1250-GISEL-REAL16-NEXT: v_pk_mul_lo_u16 v1, v7, v1 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-GISEL-REAL16-NEXT: v_pk_mul_lo_u16 v0, v0, v2 +; GFX1250-GISEL-REAL16-NEXT: v_pk_mul_lo_u16 v1, v1, v3 +; GFX1250-GISEL-REAL16-NEXT: s_set_pc_i64 s[30:31] ; GFX1250-GISEL-LABEL: clpeak_umad_pat_v3i16: ; GFX1250-GISEL: ; %bb.0: ; %entry ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -3000,6 +3271,53 @@ define <4 x i16> @clpeak_umad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) { ; GFX1250-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 ; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] ; +; GFX1250-GISEL-FAKE16-LABEL: clpeak_umad_pat_v4i16: +; GFX1250-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX1250-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] +; GFX1250-GISEL-FAKE16-NEXT: v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0] +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-GISEL-FAKE16-NEXT: v_pk_mad_u16 v4, v0, v2, v0 +; GFX1250-GISEL-FAKE16-NEXT: v_pk_mad_u16 v5, v1, v3, v1 +; GFX1250-GISEL-FAKE16-NEXT: v_pk_mad_u16 v0, v0, v2, 1 op_sel_hi:[1,1,0] +; GFX1250-GISEL-FAKE16-NEXT: v_pk_mad_u16 v1, v1, v3, 1 op_sel_hi:[1,1,0] +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-GISEL-FAKE16-NEXT: v_pk_mul_lo_u16 v6, v4, v2 +; GFX1250-GISEL-FAKE16-NEXT: v_pk_mul_lo_u16 v7, v5, v3 +; GFX1250-GISEL-FAKE16-NEXT: v_pk_mad_u16 v2, v4, v2, 1 op_sel_hi:[1,1,0] +; GFX1250-GISEL-FAKE16-NEXT: v_pk_mad_u16 v3, v5, v3, 1 op_sel_hi:[1,1,0] +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-GISEL-FAKE16-NEXT: v_pk_mul_lo_u16 v0, v6, v0 +; GFX1250-GISEL-FAKE16-NEXT: v_pk_mul_lo_u16 v1, v7, v1 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-GISEL-FAKE16-NEXT: v_pk_mul_lo_u16 v0, v0, v2 +; GFX1250-GISEL-FAKE16-NEXT: v_pk_mul_lo_u16 v1, v1, v3 +; GFX1250-GISEL-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-REAL16-LABEL: clpeak_umad_pat_v4i16: +; GFX1250-GISEL-REAL16: ; %bb.0: ; %entry +; GFX1250-GISEL-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-REAL16-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] +; GFX1250-GISEL-REAL16-NEXT: v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0] +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-GISEL-REAL16-NEXT: v_pk_mad_u16 v4, v0, v2, v0 +; GFX1250-GISEL-REAL16-NEXT: v_pk_mad_u16 v5, v1, v3, v1 +; GFX1250-GISEL-REAL16-NEXT: v_pk_mad_u16 v0, v0, v2, 1 op_sel_hi:[1,1,0] +; GFX1250-GISEL-REAL16-NEXT: v_pk_mad_u16 v1, v1, v3, 1 op_sel_hi:[1,1,0] +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-GISEL-REAL16-NEXT: v_pk_mul_lo_u16 v6, v4, v2 +; GFX1250-GISEL-REAL16-NEXT: v_pk_mul_lo_u16 v7, v5, v3 +; GFX1250-GISEL-REAL16-NEXT: v_pk_mad_u16 v2, v4, v2, 1 op_sel_hi:[1,1,0] +; GFX1250-GISEL-REAL16-NEXT: v_pk_mad_u16 v3, v5, v3, 1 op_sel_hi:[1,1,0] +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-GISEL-REAL16-NEXT: v_pk_mul_lo_u16 v0, v6, v0 +; GFX1250-GISEL-REAL16-NEXT: v_pk_mul_lo_u16 v1, v7, v1 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-GISEL-REAL16-NEXT: v_pk_mul_lo_u16 v0, v0, v2 +; GFX1250-GISEL-REAL16-NEXT: v_pk_mul_lo_u16 v1, v1, v3 +; GFX1250-GISEL-REAL16-NEXT: s_set_pc_i64 s[30:31] ; GFX1250-GISEL-LABEL: clpeak_umad_pat_v4i16: ; GFX1250-GISEL: ; %bb.0: ; %entry ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -3360,6 +3678,51 @@ define <2 x i32> @clpeak_imad_pat_v2i32(<2 x i32> %x, <2 x i32> %y) { ; GFX1250-SDAG-NEXT: v_mad_u32 v1, v3, v1, v3 ; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] ; +; GFX1250-GISEL-FAKE16-LABEL: clpeak_imad_pat_v2i32: +; GFX1250-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX1250-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: v_dual_add_nc_u32 v0, 1, v0 :: v_dual_add_nc_u32 v1, 1, v1 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v4, v0, v2 +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v5, v1, v3 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-FAKE16-NEXT: v_dual_add_nc_u32 v0, v4, v0 :: v_dual_add_nc_u32 v1, v5, v1 +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v0, v0, v2 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v1, v1, v3 +; GFX1250-GISEL-FAKE16-NEXT: v_dual_add_nc_u32 v2, 1, v4 :: v_dual_add_nc_u32 v3, 1, v5 +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v2, v0, v2 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v3, v1, v3 +; GFX1250-GISEL-FAKE16-NEXT: v_dual_add_nc_u32 v0, 1, v0 :: v_dual_add_nc_u32 v1, 1, v1 +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v0, v2, v0 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v1, v3, v1 +; GFX1250-GISEL-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-REAL16-LABEL: clpeak_imad_pat_v2i32: +; GFX1250-GISEL-REAL16: ; %bb.0: ; %entry +; GFX1250-GISEL-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-REAL16-NEXT: v_dual_add_nc_u32 v0, 1, v0 :: v_dual_add_nc_u32 v1, 1, v1 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v4, v0, v2 +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v5, v1, v3 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-REAL16-NEXT: v_dual_add_nc_u32 v0, v4, v0 :: v_dual_add_nc_u32 v1, v5, v1 +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v0, v0, v2 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v1, v1, v3 +; GFX1250-GISEL-REAL16-NEXT: v_dual_add_nc_u32 v2, 1, v4 :: v_dual_add_nc_u32 v3, 1, v5 +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v2, v0, v2 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v3, v1, v3 +; GFX1250-GISEL-REAL16-NEXT: v_dual_add_nc_u32 v0, 1, v0 :: v_dual_add_nc_u32 v1, 1, v1 +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v0, v2, v0 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v1, v3, v1 +; GFX1250-GISEL-REAL16-NEXT: s_set_pc_i64 s[30:31] ; GFX1250-GISEL-LABEL: clpeak_imad_pat_v2i32: ; GFX1250-GISEL: ; %bb.0: ; %entry ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -3843,6 +4206,77 @@ define <3 x i32> @clpeak_imad_pat_v3i32(<3 x i32> %x, <3 x i32> %y) { ; GFX1250-SDAG-NEXT: v_mad_u32 v2, v5, v2, v5 ; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] ; +; GFX1250-GISEL-FAKE16-LABEL: clpeak_imad_pat_v3i32: +; GFX1250-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX1250-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: v_dual_add_nc_u32 v0, 1, v0 :: v_dual_add_nc_u32 v1, 1, v1 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v6, v0, v3 +; GFX1250-GISEL-FAKE16-NEXT: v_add_nc_u32_e32 v2, 1, v2 +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v7, v1, v4 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-GISEL-FAKE16-NEXT: v_add_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v8, v2, v5 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-GISEL-FAKE16-NEXT: v_add_nc_u32_e32 v1, v7, v1 +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v0, v0, v3 +; GFX1250-GISEL-FAKE16-NEXT: v_add_nc_u32_e32 v3, 1, v6 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-GISEL-FAKE16-NEXT: v_add_nc_u32_e32 v2, v8, v2 +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v1, v1, v4 +; GFX1250-GISEL-FAKE16-NEXT: v_add_nc_u32_e32 v4, 1, v7 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v3, v0, v3 +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v2, v2, v5 +; GFX1250-GISEL-FAKE16-NEXT: v_add_nc_u32_e32 v5, 1, v8 +; GFX1250-GISEL-FAKE16-NEXT: v_add_nc_u32_e32 v0, 1, v0 +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v4, v1, v4 +; GFX1250-GISEL-FAKE16-NEXT: v_add_nc_u32_e32 v1, 1, v1 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v5, v2, v5 +; GFX1250-GISEL-FAKE16-NEXT: v_add_nc_u32_e32 v2, 1, v2 +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v0, v3, v0 +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v1, v4, v1 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v2, v5, v2 +; GFX1250-GISEL-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-REAL16-LABEL: clpeak_imad_pat_v3i32: +; GFX1250-GISEL-REAL16: ; %bb.0: ; %entry +; GFX1250-GISEL-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-REAL16-NEXT: v_dual_add_nc_u32 v0, 1, v0 :: v_dual_add_nc_u32 v1, 1, v1 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v6, v0, v3 +; GFX1250-GISEL-REAL16-NEXT: v_add_nc_u32_e32 v2, 1, v2 +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v7, v1, v4 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-GISEL-REAL16-NEXT: v_add_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v8, v2, v5 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-GISEL-REAL16-NEXT: v_add_nc_u32_e32 v1, v7, v1 +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v0, v0, v3 +; GFX1250-GISEL-REAL16-NEXT: v_add_nc_u32_e32 v3, 1, v6 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-GISEL-REAL16-NEXT: v_add_nc_u32_e32 v2, v8, v2 +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v1, v1, v4 +; GFX1250-GISEL-REAL16-NEXT: v_add_nc_u32_e32 v4, 1, v7 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v3, v0, v3 +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v2, v2, v5 +; GFX1250-GISEL-REAL16-NEXT: v_add_nc_u32_e32 v5, 1, v8 +; GFX1250-GISEL-REAL16-NEXT: v_add_nc_u32_e32 v0, 1, v0 +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v4, v1, v4 +; GFX1250-GISEL-REAL16-NEXT: v_add_nc_u32_e32 v1, 1, v1 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v5, v2, v5 +; GFX1250-GISEL-REAL16-NEXT: v_add_nc_u32_e32 v2, 1, v2 +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v0, v3, v0 +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v1, v4, v1 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v2, v5, v2 +; GFX1250-GISEL-REAL16-NEXT: s_set_pc_i64 s[30:31] ; GFX1250-GISEL-LABEL: clpeak_imad_pat_v3i32: ; GFX1250-GISEL: ; %bb.0: ; %entry ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -4412,6 +4846,83 @@ define <4 x i32> @clpeak_imad_pat_v4i32(<4 x i32> %x, <4 x i32> %y) { ; GFX1250-SDAG-NEXT: v_mad_u32 v3, v7, v3, v7 ; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] ; +; GFX1250-GISEL-FAKE16-LABEL: clpeak_imad_pat_v4i32: +; GFX1250-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX1250-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: v_dual_add_nc_u32 v0, 1, v0 :: v_dual_add_nc_u32 v1, 1, v1 +; GFX1250-GISEL-FAKE16-NEXT: v_dual_add_nc_u32 v2, 1, v2 :: v_dual_add_nc_u32 v3, 1, v3 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v8, v0, v4 +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v9, v1, v5 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v10, v2, v6 +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v11, v3, v7 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-GISEL-FAKE16-NEXT: v_dual_add_nc_u32 v0, v8, v0 :: v_dual_add_nc_u32 v1, v9, v1 +; GFX1250-GISEL-FAKE16-NEXT: v_dual_add_nc_u32 v2, v10, v2 :: v_dual_add_nc_u32 v3, v11, v3 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v0, v0, v4 +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v1, v1, v5 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v2, v2, v6 +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v3, v3, v7 +; GFX1250-GISEL-FAKE16-NEXT: v_dual_add_nc_u32 v4, 1, v8 :: v_dual_add_nc_u32 v5, 1, v9 +; GFX1250-GISEL-FAKE16-NEXT: v_dual_add_nc_u32 v6, 1, v10 :: v_dual_add_nc_u32 v7, 1, v11 +; GFX1250-GISEL-FAKE16-NEXT: v_add_nc_u32_e32 v8, 1, v0 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v0, v0, v4 +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v4, v1, v5 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v5, v2, v6 +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v6, v3, v7 +; GFX1250-GISEL-FAKE16-NEXT: v_dual_add_nc_u32 v1, 1, v1 :: v_dual_add_nc_u32 v2, 1, v2 +; GFX1250-GISEL-FAKE16-NEXT: v_add_nc_u32_e32 v3, 1, v3 +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v0, v0, v8 +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v1, v4, v1 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v2, v5, v2 +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v3, v6, v3 +; GFX1250-GISEL-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-REAL16-LABEL: clpeak_imad_pat_v4i32: +; GFX1250-GISEL-REAL16: ; %bb.0: ; %entry +; GFX1250-GISEL-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-REAL16-NEXT: v_dual_add_nc_u32 v0, 1, v0 :: v_dual_add_nc_u32 v1, 1, v1 +; GFX1250-GISEL-REAL16-NEXT: v_dual_add_nc_u32 v2, 1, v2 :: v_dual_add_nc_u32 v3, 1, v3 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v8, v0, v4 +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v9, v1, v5 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v10, v2, v6 +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v11, v3, v7 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-GISEL-REAL16-NEXT: v_dual_add_nc_u32 v0, v8, v0 :: v_dual_add_nc_u32 v1, v9, v1 +; GFX1250-GISEL-REAL16-NEXT: v_dual_add_nc_u32 v2, v10, v2 :: v_dual_add_nc_u32 v3, v11, v3 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v0, v0, v4 +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v1, v1, v5 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v2, v2, v6 +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v3, v3, v7 +; GFX1250-GISEL-REAL16-NEXT: v_dual_add_nc_u32 v4, 1, v8 :: v_dual_add_nc_u32 v5, 1, v9 +; GFX1250-GISEL-REAL16-NEXT: v_dual_add_nc_u32 v6, 1, v10 :: v_dual_add_nc_u32 v7, 1, v11 +; GFX1250-GISEL-REAL16-NEXT: v_add_nc_u32_e32 v8, 1, v0 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v0, v0, v4 +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v4, v1, v5 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v5, v2, v6 +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v6, v3, v7 +; GFX1250-GISEL-REAL16-NEXT: v_dual_add_nc_u32 v1, 1, v1 :: v_dual_add_nc_u32 v2, 1, v2 +; GFX1250-GISEL-REAL16-NEXT: v_add_nc_u32_e32 v3, 1, v3 +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v0, v0, v8 +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v1, v4, v1 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v2, v5, v2 +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v3, v6, v3 +; GFX1250-GISEL-REAL16-NEXT: s_set_pc_i64 s[30:31] ; GFX1250-GISEL-LABEL: clpeak_imad_pat_v4i32: ; GFX1250-GISEL: ; %bb.0: ; %entry ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -4700,6 +5211,43 @@ define i32 @clpeak_imad_pat_i24(i32 %x, i32 %y) { ; GFX1250-SDAG-NEXT: v_mad_u32 v0, v1, v0, v1 ; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] ; +; GFX1250-GISEL-FAKE16-LABEL: clpeak_imad_pat_i24: +; GFX1250-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX1250-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 24 +; GFX1250-GISEL-FAKE16-NEXT: v_bfe_i32 v1, v1, 0, 24 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-FAKE16-NEXT: v_add_nc_u32_e32 v0, 1, v0 +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v2, v1, v0 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-FAKE16-NEXT: v_add_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v0, v0, v1 +; GFX1250-GISEL-FAKE16-NEXT: v_add_nc_u32_e32 v1, 1, v2 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v1, v0, v1 +; GFX1250-GISEL-FAKE16-NEXT: v_add_nc_u32_e32 v0, 1, v0 +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v0, v1, v0 +; GFX1250-GISEL-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-REAL16-LABEL: clpeak_imad_pat_i24: +; GFX1250-GISEL-REAL16: ; %bb.0: ; %entry +; GFX1250-GISEL-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-REAL16-NEXT: v_bfe_i32 v0, v0, 0, 24 +; GFX1250-GISEL-REAL16-NEXT: v_bfe_i32 v1, v1, 0, 24 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-REAL16-NEXT: v_add_nc_u32_e32 v0, 1, v0 +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v2, v1, v0 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-REAL16-NEXT: v_add_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v0, v0, v1 +; GFX1250-GISEL-REAL16-NEXT: v_add_nc_u32_e32 v1, 1, v2 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v1, v0, v1 +; GFX1250-GISEL-REAL16-NEXT: v_add_nc_u32_e32 v0, 1, v0 +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v0, v1, v0 +; GFX1250-GISEL-REAL16-NEXT: s_set_pc_i64 s[30:31] ; GFX1250-GISEL-LABEL: clpeak_imad_pat_i24: ; GFX1250-GISEL: ; %bb.0: ; %entry ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -4972,6 +5520,43 @@ define i32 @clpeak_imad_pat_u24(i32 %x, i32 %y) { ; GFX1250-SDAG-NEXT: v_mad_u32 v0, v1, v0, v1 ; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] ; +; GFX1250-GISEL-FAKE16-LABEL: clpeak_imad_pat_u24: +; GFX1250-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX1250-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GFX1250-GISEL-FAKE16-NEXT: v_and_b32_e32 v1, 0xffffff, v1 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-FAKE16-NEXT: v_add_nc_u32_e32 v0, 1, v0 +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v2, v1, v0 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-FAKE16-NEXT: v_add_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v0, v0, v1 +; GFX1250-GISEL-FAKE16-NEXT: v_add_nc_u32_e32 v1, 1, v2 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v1, v0, v1 +; GFX1250-GISEL-FAKE16-NEXT: v_add_nc_u32_e32 v0, 1, v0 +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v0, v1, v0 +; GFX1250-GISEL-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-REAL16-LABEL: clpeak_imad_pat_u24: +; GFX1250-GISEL-REAL16: ; %bb.0: ; %entry +; GFX1250-GISEL-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-REAL16-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GFX1250-GISEL-REAL16-NEXT: v_and_b32_e32 v1, 0xffffff, v1 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-REAL16-NEXT: v_add_nc_u32_e32 v0, 1, v0 +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v2, v1, v0 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-REAL16-NEXT: v_add_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v0, v0, v1 +; GFX1250-GISEL-REAL16-NEXT: v_add_nc_u32_e32 v1, 1, v2 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v1, v0, v1 +; GFX1250-GISEL-REAL16-NEXT: v_add_nc_u32_e32 v0, 1, v0 +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v0, v1, v0 +; GFX1250-GISEL-REAL16-NEXT: s_set_pc_i64 s[30:31] ; GFX1250-GISEL-LABEL: clpeak_imad_pat_u24: ; GFX1250-GISEL: ; %bb.0: ; %entry ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -5231,19 +5816,67 @@ define signext i8 @clpeak_imad_pat_i8(i8 signext %x, i8 signext %y) { ; GFX1200-GISEL-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX1200-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX1250-SDAG-LABEL: clpeak_imad_pat_i8: -; GFX1250-SDAG: ; %bb.0: ; %entry -; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1250-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_mad_u16 v1, v0, v1, v0 -; GFX1250-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_mad_u16 v0, v0, v1, v0 -; GFX1250-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 8 -; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] -; +; GFX1250-SDAG-FAKE16-LABEL: clpeak_imad_pat_i8: +; GFX1250-SDAG-FAKE16: ; %bb.0: ; %entry +; GFX1250-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1 +; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-FAKE16-NEXT: v_mad_u16 v1, v0, v1, v0 +; GFX1250-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1 +; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v0 +; GFX1250-SDAG-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX1250-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-FAKE16-LABEL: clpeak_imad_pat_i8: +; GFX1250-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX1250-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: v_add_nc_u16 v0, v0, 1 +; GFX1250-GISEL-FAKE16-NEXT: v_add_nc_u16 v2, v1, 1 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX1250-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v2, v3 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX1250-GISEL-FAKE16-NEXT: v_mad_u16 v1, v2, v3, 1 +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX1250-GISEL-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-SDAG-REAL16-LABEL: clpeak_imad_pat_i8: +; GFX1250-SDAG-REAL16: ; %bb.0: ; %entry +; GFX1250-SDAG-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-REAL16-NEXT: v_mad_u16 v0.l, v1.l, v0.l, v1.l +; GFX1250-SDAG-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-REAL16-NEXT: v_mad_u16 v0.h, v0.l, v1.l, v0.l +; GFX1250-SDAG-REAL16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h +; GFX1250-SDAG-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-REAL16-NEXT: v_mad_u16 v0.l, v0.l, v0.h, v0.l +; GFX1250-SDAG-REAL16-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX1250-SDAG-REAL16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-REAL16-LABEL: clpeak_imad_pat_i8: +; GFX1250-GISEL-REAL16: ; %bb.0: ; %entry +; GFX1250-GISEL-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-REAL16-NEXT: v_add_nc_u16 v0.l, v0.l, 1 +; GFX1250-GISEL-REAL16-NEXT: v_add_nc_u16 v0.h, v1.l, 1 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u16 v1.h, v0.l, v1.l +; GFX1250-GISEL-REAL16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, 1 +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u16 v1.l, v0.h, v1.h +; GFX1250-GISEL-REAL16-NEXT: v_mad_u16 v0.h, v0.h, v1.h, 1 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u16 v0.l, v0.l, v0.h +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-REAL16-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX1250-GISEL-REAL16-NEXT: s_set_pc_i64 s[30:31] ; GFX1250-GISEL-LABEL: clpeak_imad_pat_i8: ; GFX1250-GISEL: ; %bb.0: ; %entry ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -5681,32 +6314,105 @@ define <2 x i8> @clpeak_imad_pat_v2i8(<2 x i8> %x, <2 x i8> %y) { ; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v1, v3 ; GFX1200-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX1250-SDAG-LABEL: clpeak_imad_pat_v2i8: -; GFX1250-SDAG: ; %bb.0: ; %entry -; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1250-SDAG-NEXT: v_add_nc_u16 v1, v1, 1 -; GFX1250-SDAG-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1250-SDAG-NEXT: v_mad_u16 v4, v1, v3, v1 -; GFX1250-SDAG-NEXT: v_mul_lo_u16 v1, v1, v3 -; GFX1250-SDAG-NEXT: v_mad_u16 v5, v0, v2, v0 -; GFX1250-SDAG-NEXT: v_mul_lo_u16 v0, v0, v2 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250-SDAG-NEXT: v_mul_lo_u16 v3, v4, v3 -; GFX1250-SDAG-NEXT: v_mul_lo_u16 v2, v5, v2 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250-SDAG-NEXT: v_mad_u16 v1, v3, v1, v3 -; GFX1250-SDAG-NEXT: v_mad_u16 v0, v2, v0, v2 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250-SDAG-NEXT: v_mad_u16 v1, v1, v3, v1 -; GFX1250-SDAG-NEXT: v_mad_u16 v0, v0, v2, v0 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1250-SDAG-NEXT: v_lshlrev_b16 v2, 8, v1 -; GFX1250-SDAG-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX1250-SDAG-NEXT: v_bitop3_b16 v0, v0, v2, 0xff bitop3:0xec -; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] -; +; GFX1250-SDAG-FAKE16-LABEL: clpeak_imad_pat_v2i8: +; GFX1250-SDAG-FAKE16: ; %bb.0: ; %entry +; GFX1250-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: v_add_nc_u16 v1, v1, 1 +; GFX1250-SDAG-FAKE16-NEXT: v_add_nc_u16 v0, v0, 1 +; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1250-SDAG-FAKE16-NEXT: v_mad_u16 v4, v1, v3, v1 +; GFX1250-SDAG-FAKE16-NEXT: v_mul_lo_u16 v1, v1, v3 +; GFX1250-SDAG-FAKE16-NEXT: v_mad_u16 v5, v0, v2, v0 +; GFX1250-SDAG-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v2 +; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-SDAG-FAKE16-NEXT: v_mul_lo_u16 v3, v4, v3 +; GFX1250-SDAG-FAKE16-NEXT: v_mul_lo_u16 v2, v5, v2 +; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-SDAG-FAKE16-NEXT: v_mad_u16 v1, v3, v1, v3 +; GFX1250-SDAG-FAKE16-NEXT: v_mad_u16 v0, v2, v0, v2 +; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-SDAG-FAKE16-NEXT: v_mad_u16 v1, v1, v3, v1 +; GFX1250-SDAG-FAKE16-NEXT: v_mad_u16 v0, v0, v2, v0 +; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-SDAG-FAKE16-NEXT: v_lshlrev_b16 v2, 8, v1 +; GFX1250-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX1250-SDAG-FAKE16-NEXT: v_bitop3_b16 v0, v0, v2, 0xff bitop3:0xec +; GFX1250-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-FAKE16-LABEL: clpeak_imad_pat_v2i8: +; GFX1250-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX1250-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: v_add_nc_u16 v0, v0, 1 +; GFX1250-GISEL-FAKE16-NEXT: v_add_nc_u16 v1, v1, 1 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-GISEL-FAKE16-NEXT: v_mad_u16 v4, v0, v2, v0 +; GFX1250-GISEL-FAKE16-NEXT: v_mad_u16 v5, v1, v3, v1 +; GFX1250-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v2, 1 +; GFX1250-GISEL-FAKE16-NEXT: v_mad_u16 v1, v1, v3, 1 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u16 v6, v4, v2 +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u16 v7, v5, v3 +; GFX1250-GISEL-FAKE16-NEXT: v_mad_u16 v2, v4, v2, 1 +; GFX1250-GISEL-FAKE16-NEXT: v_mad_u16 v3, v5, v3, 1 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v6, v0 +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v7, v1 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v2 +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v1, v3 +; GFX1250-GISEL-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-SDAG-REAL16-LABEL: clpeak_imad_pat_v2i8: +; GFX1250-SDAG-REAL16: ; %bb.0: ; %entry +; GFX1250-SDAG-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-REAL16-NEXT: v_add_nc_u16 v0.h, v1.l, 1 +; GFX1250-SDAG-REAL16-NEXT: v_add_nc_u16 v0.l, v0.l, 1 +; GFX1250-SDAG-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1250-SDAG-REAL16-NEXT: v_mad_u16 v1.l, v0.h, v3.l, v0.h +; GFX1250-SDAG-REAL16-NEXT: v_mul_lo_u16 v0.h, v0.h, v3.l +; GFX1250-SDAG-REAL16-NEXT: v_mad_u16 v1.h, v0.l, v2.l, v0.l +; GFX1250-SDAG-REAL16-NEXT: v_mul_lo_u16 v0.l, v0.l, v2.l +; GFX1250-SDAG-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-SDAG-REAL16-NEXT: v_mul_lo_u16 v1.l, v1.l, v3.l +; GFX1250-SDAG-REAL16-NEXT: v_mul_lo_u16 v1.h, v1.h, v2.l +; GFX1250-SDAG-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-SDAG-REAL16-NEXT: v_mad_u16 v0.h, v1.l, v0.h, v1.l +; GFX1250-SDAG-REAL16-NEXT: v_mad_u16 v0.l, v1.h, v0.l, v1.h +; GFX1250-SDAG-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-SDAG-REAL16-NEXT: v_mad_u16 v0.h, v0.h, v1.l, v0.h +; GFX1250-SDAG-REAL16-NEXT: v_mad_u16 v0.l, v0.l, v1.h, v0.l +; GFX1250-SDAG-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-REAL16-NEXT: v_lshlrev_b16 v1.l, 8, v0.h +; GFX1250-SDAG-REAL16-NEXT: v_bitop3_b16 v0.l, v0.l, v1.l, 0xff bitop3:0xec +; GFX1250-SDAG-REAL16-NEXT: v_and_b16 v1.l, 0xff, v0.h +; GFX1250-SDAG-REAL16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-REAL16-LABEL: clpeak_imad_pat_v2i8: +; GFX1250-GISEL-REAL16: ; %bb.0: ; %entry +; GFX1250-GISEL-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-REAL16-NEXT: v_add_nc_u16 v0.l, v0.l, 1 +; GFX1250-GISEL-REAL16-NEXT: v_add_nc_u16 v0.h, v1.l, 1 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-GISEL-REAL16-NEXT: v_mad_u16 v1.l, v0.l, v2.l, v0.l +; GFX1250-GISEL-REAL16-NEXT: v_mad_u16 v1.h, v0.h, v3.l, v0.h +; GFX1250-GISEL-REAL16-NEXT: v_mad_u16 v0.l, v0.l, v2.l, 1 +; GFX1250-GISEL-REAL16-NEXT: v_mad_u16 v0.h, v0.h, v3.l, 1 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u16 v2.h, v1.l, v2.l +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u16 v3.h, v1.h, v3.l +; GFX1250-GISEL-REAL16-NEXT: v_mad_u16 v1.l, v1.l, v2.l, 1 +; GFX1250-GISEL-REAL16-NEXT: v_mad_u16 v1.h, v1.h, v3.l, 1 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u16 v0.l, v2.h, v0.l +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u16 v0.h, v3.h, v0.h +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u16 v1.l, v0.h, v1.h +; GFX1250-GISEL-REAL16-NEXT: s_set_pc_i64 s[30:31] ; GFX1250-GISEL-LABEL: clpeak_imad_pat_v2i8: ; GFX1250-GISEL: ; %bb.0: ; %entry ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -6254,6 +6960,39 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX1250-SDAG-NEXT: v_mad_u32 v1, v6, v3, v1 ; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] ; +; GFX1250-GISEL-FAKE16-LABEL: clpeak_imad_pat_i64: +; GFX1250-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX1250-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: v_add_nc_u64_e32 v[0:1], 1, v[0:1] +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-FAKE16-NEXT: v_mul_u64_e32 v[4:5], v[0:1], v[2:3] +; GFX1250-GISEL-FAKE16-NEXT: v_add_nc_u64_e32 v[0:1], v[4:5], v[0:1] +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-GISEL-FAKE16-NEXT: v_mul_u64_e32 v[0:1], v[0:1], v[2:3] +; GFX1250-GISEL-FAKE16-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[4:5] +; GFX1250-GISEL-FAKE16-NEXT: v_mul_u64_e32 v[2:3], v[0:1], v[2:3] +; GFX1250-GISEL-FAKE16-NEXT: v_add_nc_u64_e32 v[0:1], 1, v[0:1] +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-FAKE16-NEXT: v_mul_u64_e32 v[0:1], v[2:3], v[0:1] +; GFX1250-GISEL-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-REAL16-LABEL: clpeak_imad_pat_i64: +; GFX1250-GISEL-REAL16: ; %bb.0: ; %entry +; GFX1250-GISEL-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-REAL16-NEXT: v_add_nc_u64_e32 v[0:1], 1, v[0:1] +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-REAL16-NEXT: v_mul_u64_e32 v[4:5], v[0:1], v[2:3] +; GFX1250-GISEL-REAL16-NEXT: v_add_nc_u64_e32 v[0:1], v[4:5], v[0:1] +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-GISEL-REAL16-NEXT: v_mul_u64_e32 v[0:1], v[0:1], v[2:3] +; GFX1250-GISEL-REAL16-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[4:5] +; GFX1250-GISEL-REAL16-NEXT: v_mul_u64_e32 v[2:3], v[0:1], v[2:3] +; GFX1250-GISEL-REAL16-NEXT: v_add_nc_u64_e32 v[0:1], 1, v[0:1] +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-REAL16-NEXT: v_mul_u64_e32 v[0:1], v[2:3], v[0:1] +; GFX1250-GISEL-REAL16-NEXT: s_set_pc_i64 s[30:31] ; GFX1250-GISEL-LABEL: clpeak_imad_pat_i64: ; GFX1250-GISEL: ; %bb.0: ; %entry ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -7189,6 +7928,59 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX1250-SDAG-NEXT: v_mad_u32 v3, v14, v7, v3 ; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] ; +; GFX1250-GISEL-FAKE16-LABEL: clpeak_imad_pat_v2i64: +; GFX1250-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX1250-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: v_add_nc_u64_e32 v[0:1], 1, v[0:1] +; GFX1250-GISEL-FAKE16-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[2:3] +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-GISEL-FAKE16-NEXT: v_mul_u64_e32 v[8:9], v[0:1], v[4:5] +; GFX1250-GISEL-FAKE16-NEXT: v_mul_u64_e32 v[10:11], v[2:3], v[6:7] +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-GISEL-FAKE16-NEXT: v_add_nc_u64_e32 v[0:1], v[8:9], v[0:1] +; GFX1250-GISEL-FAKE16-NEXT: v_add_nc_u64_e32 v[2:3], v[10:11], v[2:3] +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-GISEL-FAKE16-NEXT: v_mul_u64_e32 v[0:1], v[0:1], v[4:5] +; GFX1250-GISEL-FAKE16-NEXT: v_mul_u64_e32 v[2:3], v[2:3], v[6:7] +; GFX1250-GISEL-FAKE16-NEXT: v_add_nc_u64_e32 v[4:5], 1, v[8:9] +; GFX1250-GISEL-FAKE16-NEXT: v_add_nc_u64_e32 v[6:7], 1, v[10:11] +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-GISEL-FAKE16-NEXT: v_mul_u64_e32 v[4:5], v[0:1], v[4:5] +; GFX1250-GISEL-FAKE16-NEXT: v_mul_u64_e32 v[6:7], v[2:3], v[6:7] +; GFX1250-GISEL-FAKE16-NEXT: v_add_nc_u64_e32 v[0:1], 1, v[0:1] +; GFX1250-GISEL-FAKE16-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[2:3] +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-GISEL-FAKE16-NEXT: v_mul_u64_e32 v[0:1], v[4:5], v[0:1] +; GFX1250-GISEL-FAKE16-NEXT: v_mul_u64_e32 v[2:3], v[6:7], v[2:3] +; GFX1250-GISEL-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-REAL16-LABEL: clpeak_imad_pat_v2i64: +; GFX1250-GISEL-REAL16: ; %bb.0: ; %entry +; GFX1250-GISEL-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-REAL16-NEXT: v_add_nc_u64_e32 v[0:1], 1, v[0:1] +; GFX1250-GISEL-REAL16-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[2:3] +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-GISEL-REAL16-NEXT: v_mul_u64_e32 v[8:9], v[0:1], v[4:5] +; GFX1250-GISEL-REAL16-NEXT: v_mul_u64_e32 v[10:11], v[2:3], v[6:7] +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-GISEL-REAL16-NEXT: v_add_nc_u64_e32 v[0:1], v[8:9], v[0:1] +; GFX1250-GISEL-REAL16-NEXT: v_add_nc_u64_e32 v[2:3], v[10:11], v[2:3] +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-GISEL-REAL16-NEXT: v_mul_u64_e32 v[0:1], v[0:1], v[4:5] +; GFX1250-GISEL-REAL16-NEXT: v_mul_u64_e32 v[2:3], v[2:3], v[6:7] +; GFX1250-GISEL-REAL16-NEXT: v_add_nc_u64_e32 v[4:5], 1, v[8:9] +; GFX1250-GISEL-REAL16-NEXT: v_add_nc_u64_e32 v[6:7], 1, v[10:11] +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-GISEL-REAL16-NEXT: v_mul_u64_e32 v[4:5], v[0:1], v[4:5] +; GFX1250-GISEL-REAL16-NEXT: v_mul_u64_e32 v[6:7], v[2:3], v[6:7] +; GFX1250-GISEL-REAL16-NEXT: v_add_nc_u64_e32 v[0:1], 1, v[0:1] +; GFX1250-GISEL-REAL16-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[2:3] +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-GISEL-REAL16-NEXT: v_mul_u64_e32 v[0:1], v[4:5], v[0:1] +; GFX1250-GISEL-REAL16-NEXT: v_mul_u64_e32 v[2:3], v[6:7], v[2:3] +; GFX1250-GISEL-REAL16-NEXT: s_set_pc_i64 s[30:31] ; GFX1250-GISEL-LABEL: clpeak_imad_pat_v2i64: ; GFX1250-GISEL: ; %bb.0: ; %entry ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -7497,6 +8289,53 @@ define i32 @v_multi_use_mul_chain_add_other_use_all(i32 %arg, i32 %arg1, i32 %ar ; GFX1250-SDAG-NEXT: v_add_nc_u32_e32 v0, v3, v0 ; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] ; +; GFX1250-GISEL-FAKE16-LABEL: v_multi_use_mul_chain_add_other_use_all: +; GFX1250-GISEL-FAKE16: ; %bb.0: ; %bb +; GFX1250-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: v_dual_add_nc_u32 v0, 1, v0 :: v_dual_mov_b32 v2, v3 +; GFX1250-GISEL-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v4, v0, v1 +; GFX1250-GISEL-FAKE16-NEXT: v_add_nc_u32_e32 v0, v4, v0 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v1, v0, v1 +; GFX1250-GISEL-FAKE16-NEXT: v_add_nc_u32_e32 v0, 1, v4 +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v5, v1, v0 +; GFX1250-GISEL-FAKE16-NEXT: global_store_b32 v[2:3], v4, off scope:SCOPE_SYS +; GFX1250-GISEL-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: global_store_b32 v[2:3], v1, off scope:SCOPE_SYS +; GFX1250-GISEL-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: global_store_b32 v[2:3], v5, off scope:SCOPE_SYS +; GFX1250-GISEL-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: v_add_nc_u32_e32 v0, v5, v0 +; GFX1250-GISEL-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-REAL16-LABEL: v_multi_use_mul_chain_add_other_use_all: +; GFX1250-GISEL-REAL16: ; %bb.0: ; %bb +; GFX1250-GISEL-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-REAL16-NEXT: v_dual_add_nc_u32 v0, 1, v0 :: v_dual_mov_b32 v2, v3 +; GFX1250-GISEL-REAL16-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v4, v0, v1 +; GFX1250-GISEL-REAL16-NEXT: v_add_nc_u32_e32 v0, v4, v0 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v1, v0, v1 +; GFX1250-GISEL-REAL16-NEXT: v_add_nc_u32_e32 v0, 1, v4 +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v5, v1, v0 +; GFX1250-GISEL-REAL16-NEXT: global_store_b32 v[2:3], v4, off scope:SCOPE_SYS +; GFX1250-GISEL-REAL16-NEXT: s_wait_storecnt 0x0 +; GFX1250-GISEL-REAL16-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-REAL16-NEXT: global_store_b32 v[2:3], v1, off scope:SCOPE_SYS +; GFX1250-GISEL-REAL16-NEXT: s_wait_storecnt 0x0 +; GFX1250-GISEL-REAL16-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-REAL16-NEXT: global_store_b32 v[2:3], v5, off scope:SCOPE_SYS +; GFX1250-GISEL-REAL16-NEXT: s_wait_storecnt 0x0 +; GFX1250-GISEL-REAL16-NEXT: v_add_nc_u32_e32 v0, v5, v0 +; GFX1250-GISEL-REAL16-NEXT: s_set_pc_i64 s[30:31] ; GFX1250-GISEL-LABEL: v_multi_use_mul_chain_add_other_use_all: ; GFX1250-GISEL: ; %bb.0: ; %bb ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -7775,6 +8614,47 @@ define i32 @v_multi_use_mul_chain_add_other_use_some(i32 %arg, i32 %arg1, i32 %a ; GFX1250-SDAG-NEXT: v_add_nc_u32_e32 v0, v3, v1 ; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] ; +; GFX1250-GISEL-FAKE16-LABEL: v_multi_use_mul_chain_add_other_use_some: +; GFX1250-GISEL-FAKE16: ; %bb.0: ; %bb +; GFX1250-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: v_dual_add_nc_u32 v0, 1, v0 :: v_dual_mov_b32 v2, v3 +; GFX1250-GISEL-FAKE16-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v4, v0, v1 +; GFX1250-GISEL-FAKE16-NEXT: v_add_nc_u32_e32 v0, v4, v0 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v0, v0, v1 +; GFX1250-GISEL-FAKE16-NEXT: v_add_nc_u32_e32 v1, 1, v4 +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v5, v0, v1 +; GFX1250-GISEL-FAKE16-NEXT: global_store_b32 v[2:3], v4, off scope:SCOPE_SYS +; GFX1250-GISEL-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: global_store_b32 v[2:3], v5, off scope:SCOPE_SYS +; GFX1250-GISEL-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: v_add_nc_u32_e32 v0, v5, v1 +; GFX1250-GISEL-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-REAL16-LABEL: v_multi_use_mul_chain_add_other_use_some: +; GFX1250-GISEL-REAL16: ; %bb.0: ; %bb +; GFX1250-GISEL-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-REAL16-NEXT: v_dual_add_nc_u32 v0, 1, v0 :: v_dual_mov_b32 v2, v3 +; GFX1250-GISEL-REAL16-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v4, v0, v1 +; GFX1250-GISEL-REAL16-NEXT: v_add_nc_u32_e32 v0, v4, v0 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v0, v0, v1 +; GFX1250-GISEL-REAL16-NEXT: v_add_nc_u32_e32 v1, 1, v4 +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v5, v0, v1 +; GFX1250-GISEL-REAL16-NEXT: global_store_b32 v[2:3], v4, off scope:SCOPE_SYS +; GFX1250-GISEL-REAL16-NEXT: s_wait_storecnt 0x0 +; GFX1250-GISEL-REAL16-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-REAL16-NEXT: global_store_b32 v[2:3], v5, off scope:SCOPE_SYS +; GFX1250-GISEL-REAL16-NEXT: s_wait_storecnt 0x0 +; GFX1250-GISEL-REAL16-NEXT: v_add_nc_u32_e32 v0, v5, v1 +; GFX1250-GISEL-REAL16-NEXT: s_set_pc_i64 s[30:31] ; GFX1250-GISEL-LABEL: v_multi_use_mul_chain_add_other_use_some: ; GFX1250-GISEL: ; %bb.0: ; %bb ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -8151,6 +9031,61 @@ define i32 @clpeak_imad_pat_i32_x2(i32 %x, i32 %y) { ; GFX1250-SDAG-NEXT: v_mad_u32 v0, v1, v0, v1 ; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] ; +; GFX1250-GISEL-FAKE16-LABEL: clpeak_imad_pat_i32_x2: +; GFX1250-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX1250-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: v_add_nc_u32_e32 v0, 1, v0 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v2, v0, v1 +; GFX1250-GISEL-FAKE16-NEXT: v_add_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v0, v0, v1 +; GFX1250-GISEL-FAKE16-NEXT: v_add_nc_u32_e32 v1, 1, v2 +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v2, v0, v1 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-FAKE16-NEXT: v_add_nc_u32_e32 v1, v2, v1 +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v0, v1, v0 +; GFX1250-GISEL-FAKE16-NEXT: v_add_nc_u32_e32 v1, 1, v2 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v2, v0, v1 +; GFX1250-GISEL-FAKE16-NEXT: v_add_nc_u32_e32 v1, v2, v1 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v0, v1, v0 +; GFX1250-GISEL-FAKE16-NEXT: v_add_nc_u32_e32 v1, 1, v2 +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v1, v0, v1 +; GFX1250-GISEL-FAKE16-NEXT: v_add_nc_u32_e32 v0, 1, v0 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v0, v1, v0 +; GFX1250-GISEL-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-REAL16-LABEL: clpeak_imad_pat_i32_x2: +; GFX1250-GISEL-REAL16: ; %bb.0: ; %entry +; GFX1250-GISEL-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-REAL16-NEXT: v_add_nc_u32_e32 v0, 1, v0 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v2, v0, v1 +; GFX1250-GISEL-REAL16-NEXT: v_add_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v0, v0, v1 +; GFX1250-GISEL-REAL16-NEXT: v_add_nc_u32_e32 v1, 1, v2 +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v2, v0, v1 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-REAL16-NEXT: v_add_nc_u32_e32 v1, v2, v1 +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v0, v1, v0 +; GFX1250-GISEL-REAL16-NEXT: v_add_nc_u32_e32 v1, 1, v2 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v2, v0, v1 +; GFX1250-GISEL-REAL16-NEXT: v_add_nc_u32_e32 v1, v2, v1 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v0, v1, v0 +; GFX1250-GISEL-REAL16-NEXT: v_add_nc_u32_e32 v1, 1, v2 +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v1, v0, v1 +; GFX1250-GISEL-REAL16-NEXT: v_add_nc_u32_e32 v0, 1, v0 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v0, v1, v0 +; GFX1250-GISEL-REAL16-NEXT: s_set_pc_i64 s[30:31] ; GFX1250-GISEL-LABEL: clpeak_imad_pat_i32_x2: ; GFX1250-GISEL: ; %bb.0: ; %entry ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -8788,6 +9723,85 @@ define <2 x i32> @clpeak_imad_pat_v2i32_x2(<2 x i32> %x, <2 x i32> %y) { ; GFX1250-SDAG-NEXT: v_mad_u32 v1, v3, v1, v3 ; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] ; +; GFX1250-GISEL-FAKE16-LABEL: clpeak_imad_pat_v2i32_x2: +; GFX1250-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX1250-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: v_dual_add_nc_u32 v0, 1, v0 :: v_dual_add_nc_u32 v1, 1, v1 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v4, v0, v2 +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v5, v1, v3 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-FAKE16-NEXT: v_dual_add_nc_u32 v0, v4, v0 :: v_dual_add_nc_u32 v1, v5, v1 +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v0, v0, v2 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v1, v1, v3 +; GFX1250-GISEL-FAKE16-NEXT: v_dual_add_nc_u32 v2, 1, v4 :: v_dual_add_nc_u32 v3, 1, v5 +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v4, v0, v2 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v5, v1, v3 +; GFX1250-GISEL-FAKE16-NEXT: v_dual_add_nc_u32 v2, v4, v2 :: v_dual_add_nc_u32 v3, v5, v3 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v0, v2, v0 +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v1, v3, v1 +; GFX1250-GISEL-FAKE16-NEXT: v_dual_add_nc_u32 v2, 1, v4 :: v_dual_add_nc_u32 v3, 1, v5 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v4, v0, v2 +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v5, v1, v3 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-FAKE16-NEXT: v_dual_add_nc_u32 v2, v4, v2 :: v_dual_add_nc_u32 v3, v5, v3 +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v0, v2, v0 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v1, v3, v1 +; GFX1250-GISEL-FAKE16-NEXT: v_dual_add_nc_u32 v2, 1, v4 :: v_dual_add_nc_u32 v3, 1, v5 +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v2, v0, v2 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v3, v1, v3 +; GFX1250-GISEL-FAKE16-NEXT: v_dual_add_nc_u32 v0, 1, v0 :: v_dual_add_nc_u32 v1, 1, v1 +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v0, v2, v0 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u32 v1, v3, v1 +; GFX1250-GISEL-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-REAL16-LABEL: clpeak_imad_pat_v2i32_x2: +; GFX1250-GISEL-REAL16: ; %bb.0: ; %entry +; GFX1250-GISEL-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-REAL16-NEXT: v_dual_add_nc_u32 v0, 1, v0 :: v_dual_add_nc_u32 v1, 1, v1 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v4, v0, v2 +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v5, v1, v3 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-REAL16-NEXT: v_dual_add_nc_u32 v0, v4, v0 :: v_dual_add_nc_u32 v1, v5, v1 +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v0, v0, v2 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v1, v1, v3 +; GFX1250-GISEL-REAL16-NEXT: v_dual_add_nc_u32 v2, 1, v4 :: v_dual_add_nc_u32 v3, 1, v5 +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v4, v0, v2 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v5, v1, v3 +; GFX1250-GISEL-REAL16-NEXT: v_dual_add_nc_u32 v2, v4, v2 :: v_dual_add_nc_u32 v3, v5, v3 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v0, v2, v0 +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v1, v3, v1 +; GFX1250-GISEL-REAL16-NEXT: v_dual_add_nc_u32 v2, 1, v4 :: v_dual_add_nc_u32 v3, 1, v5 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v4, v0, v2 +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v5, v1, v3 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-REAL16-NEXT: v_dual_add_nc_u32 v2, v4, v2 :: v_dual_add_nc_u32 v3, v5, v3 +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v0, v2, v0 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v1, v3, v1 +; GFX1250-GISEL-REAL16-NEXT: v_dual_add_nc_u32 v2, 1, v4 :: v_dual_add_nc_u32 v3, 1, v5 +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v2, v0, v2 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v3, v1, v3 +; GFX1250-GISEL-REAL16-NEXT: v_dual_add_nc_u32 v0, 1, v0 :: v_dual_add_nc_u32 v1, 1, v1 +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v0, v2, v0 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u32 v1, v3, v1 +; GFX1250-GISEL-REAL16-NEXT: s_set_pc_i64 s[30:31] ; GFX1250-GISEL-LABEL: clpeak_imad_pat_v2i32_x2: ; GFX1250-GISEL: ; %bb.0: ; %entry ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -9190,25 +10204,99 @@ define signext i16 @clpeak_imad_pat_i16_x2(i16 signext %x, i16 signext %y) { ; GFX1200-GISEL-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX1200-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX1250-SDAG-LABEL: clpeak_imad_pat_i16_x2: -; GFX1250-SDAG: ; %bb.0: ; %entry -; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1250-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_mad_u16 v1, v0, v1, v0 -; GFX1250-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_mad_u16 v1, v0, v1, v0 -; GFX1250-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_mad_u16 v1, v0, v1, v0 -; GFX1250-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_mad_u16 v0, v0, v1, v0 -; GFX1250-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] -; +; GFX1250-SDAG-FAKE16-LABEL: clpeak_imad_pat_i16_x2: +; GFX1250-SDAG-FAKE16: ; %bb.0: ; %entry +; GFX1250-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1 +; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-FAKE16-NEXT: v_mad_u16 v1, v0, v1, v0 +; GFX1250-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1 +; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-FAKE16-NEXT: v_mad_u16 v1, v0, v1, v0 +; GFX1250-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1 +; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-FAKE16-NEXT: v_mad_u16 v1, v0, v1, v0 +; GFX1250-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1 +; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v0 +; GFX1250-SDAG-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX1250-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-FAKE16-LABEL: clpeak_imad_pat_i16_x2: +; GFX1250-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX1250-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: v_add_nc_u16 v0, v0, 1 +; GFX1250-GISEL-FAKE16-NEXT: v_add_nc_u16 v2, v1, 1 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX1250-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v2, v3 +; GFX1250-GISEL-FAKE16-NEXT: v_mad_u16 v2, v2, v3, 1 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX1250-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v2, v3 +; GFX1250-GISEL-FAKE16-NEXT: v_mad_u16 v2, v2, v3, 1 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX1250-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v2, v3 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX1250-GISEL-FAKE16-NEXT: v_mad_u16 v1, v2, v3, 1 +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX1250-GISEL-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-SDAG-REAL16-LABEL: clpeak_imad_pat_i16_x2: +; GFX1250-SDAG-REAL16: ; %bb.0: ; %entry +; GFX1250-SDAG-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-REAL16-NEXT: v_mad_u16 v0.l, v1.l, v0.l, v1.l +; GFX1250-SDAG-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-REAL16-NEXT: v_mad_u16 v0.h, v0.l, v1.l, v0.l +; GFX1250-SDAG-REAL16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h +; GFX1250-SDAG-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-REAL16-NEXT: v_mad_u16 v0.h, v0.l, v0.h, v0.l +; GFX1250-SDAG-REAL16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h +; GFX1250-SDAG-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-REAL16-NEXT: v_mad_u16 v0.h, v0.l, v0.h, v0.l +; GFX1250-SDAG-REAL16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h +; GFX1250-SDAG-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-REAL16-NEXT: v_mad_u16 v0.l, v0.l, v0.h, v0.l +; GFX1250-SDAG-REAL16-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX1250-SDAG-REAL16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-REAL16-LABEL: clpeak_imad_pat_i16_x2: +; GFX1250-GISEL-REAL16: ; %bb.0: ; %entry +; GFX1250-GISEL-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-REAL16-NEXT: v_add_nc_u16 v0.l, v0.l, 1 +; GFX1250-GISEL-REAL16-NEXT: v_add_nc_u16 v0.h, v1.l, 1 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u16 v1.h, v0.l, v1.l +; GFX1250-GISEL-REAL16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, 1 +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u16 v1.l, v0.h, v1.h +; GFX1250-GISEL-REAL16-NEXT: v_mad_u16 v0.h, v0.h, v1.h, 1 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u16 v1.h, v0.l, v1.l +; GFX1250-GISEL-REAL16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, 1 +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u16 v1.l, v0.h, v1.h +; GFX1250-GISEL-REAL16-NEXT: v_mad_u16 v0.h, v0.h, v1.h, 1 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u16 v1.h, v0.l, v1.l +; GFX1250-GISEL-REAL16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, 1 +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u16 v1.l, v0.h, v1.h +; GFX1250-GISEL-REAL16-NEXT: v_mad_u16 v0.h, v0.h, v1.h, 1 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u16 v0.l, v0.l, v0.h +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-REAL16-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX1250-GISEL-REAL16-NEXT: s_set_pc_i64 s[30:31] ; GFX1250-GISEL-LABEL: clpeak_imad_pat_i16_x2: ; GFX1250-GISEL: ; %bb.0: ; %entry ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -9592,25 +10680,98 @@ define zeroext i16 @clpeak_umad_pat_i16_x2(i16 zeroext %x, i16 zeroext %y) { ; GFX1200-GISEL-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX1200-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX1250-SDAG-LABEL: clpeak_umad_pat_i16_x2: -; GFX1250-SDAG: ; %bb.0: ; %entry -; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1250-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_mad_u16 v1, v0, v1, v0 -; GFX1250-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_mad_u16 v1, v0, v1, v0 -; GFX1250-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_mad_u16 v1, v0, v1, v0 -; GFX1250-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_mad_u16 v0, v0, v1, v0 -; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] -; +; GFX1250-SDAG-FAKE16-LABEL: clpeak_umad_pat_i16_x2: +; GFX1250-SDAG-FAKE16: ; %bb.0: ; %entry +; GFX1250-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1 +; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-FAKE16-NEXT: v_mad_u16 v1, v0, v1, v0 +; GFX1250-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1 +; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-FAKE16-NEXT: v_mad_u16 v1, v0, v1, v0 +; GFX1250-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1 +; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-FAKE16-NEXT: v_mad_u16 v1, v0, v1, v0 +; GFX1250-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1 +; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v0 +; GFX1250-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1250-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-FAKE16-LABEL: clpeak_umad_pat_i16_x2: +; GFX1250-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX1250-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: v_add_nc_u16 v0, v0, 1 +; GFX1250-GISEL-FAKE16-NEXT: v_add_nc_u16 v2, v1, 1 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX1250-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v2, v3 +; GFX1250-GISEL-FAKE16-NEXT: v_mad_u16 v2, v2, v3, 1 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX1250-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v2, v3 +; GFX1250-GISEL-FAKE16-NEXT: v_mad_u16 v2, v2, v3, 1 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX1250-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v2, v3 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX1250-GISEL-FAKE16-NEXT: v_mad_u16 v1, v2, v3, 1 +; GFX1250-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1250-GISEL-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-SDAG-REAL16-LABEL: clpeak_umad_pat_i16_x2: +; GFX1250-SDAG-REAL16: ; %bb.0: ; %entry +; GFX1250-SDAG-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-REAL16-NEXT: v_mad_u16 v0.l, v1.l, v0.l, v1.l +; GFX1250-SDAG-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-REAL16-NEXT: v_mad_u16 v0.h, v0.l, v1.l, v0.l +; GFX1250-SDAG-REAL16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h +; GFX1250-SDAG-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-REAL16-NEXT: v_mad_u16 v0.h, v0.l, v0.h, v0.l +; GFX1250-SDAG-REAL16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h +; GFX1250-SDAG-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-REAL16-NEXT: v_mad_u16 v0.h, v0.l, v0.h, v0.l +; GFX1250-SDAG-REAL16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h +; GFX1250-SDAG-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-REAL16-NEXT: v_mad_u16 v0.l, v0.l, v0.h, v0.l +; GFX1250-SDAG-REAL16-NEXT: v_mov_b16_e32 v0.h, 0 +; GFX1250-SDAG-REAL16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-REAL16-LABEL: clpeak_umad_pat_i16_x2: +; GFX1250-GISEL-REAL16: ; %bb.0: ; %entry +; GFX1250-GISEL-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-REAL16-NEXT: v_add_nc_u16 v0.l, v0.l, 1 +; GFX1250-GISEL-REAL16-NEXT: v_add_nc_u16 v0.h, v1.l, 1 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u16 v1.h, v0.l, v1.l +; GFX1250-GISEL-REAL16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, 1 +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u16 v1.l, v0.h, v1.h +; GFX1250-GISEL-REAL16-NEXT: v_mad_u16 v0.h, v0.h, v1.h, 1 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u16 v1.h, v0.l, v1.l +; GFX1250-GISEL-REAL16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, 1 +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u16 v1.l, v0.h, v1.h +; GFX1250-GISEL-REAL16-NEXT: v_mad_u16 v0.h, v0.h, v1.h, 1 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u16 v1.h, v0.l, v1.l +; GFX1250-GISEL-REAL16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, 1 +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u16 v1.l, v0.h, v1.h +; GFX1250-GISEL-REAL16-NEXT: v_mad_u16 v0.h, v0.h, v1.h, 1 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l +; GFX1250-GISEL-REAL16-NEXT: v_mul_lo_u16 v0.l, v0.l, v0.h +; GFX1250-GISEL-REAL16-NEXT: v_mov_b16_e32 v0.h, 0 +; GFX1250-GISEL-REAL16-NEXT: s_set_pc_i64 s[30:31] ; GFX1250-GISEL-LABEL: clpeak_umad_pat_i16_x2: ; GFX1250-GISEL: ; %bb.0: ; %entry ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -10597,16 +11758,46 @@ define <2 x i16> @multi_use_mul_mad_i16_var(i16 %x, i16 %y, i16 %z0, i16 %z1) { ; GFX1200-GISEL-FAKE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX1200-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX1250-SDAG-LABEL: multi_use_mul_mad_i16_var: -; GFX1250-SDAG: ; %bb.0: ; %entry -; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1250-SDAG-NEXT: v_mad_u16 v2, v0, v1, v2 -; GFX1250-SDAG-NEXT: v_mad_u16 v0, v0, v1, v3 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_perm_b32 v0, v0, v2, 0x5040100 -; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] -; +; GFX1250-SDAG-FAKE16-LABEL: multi_use_mul_mad_i16_var: +; GFX1250-SDAG-FAKE16: ; %bb.0: ; %entry +; GFX1250-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: v_mad_u16 v2, v0, v1, v2 +; GFX1250-SDAG-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v3 +; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x5040100 +; GFX1250-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-FAKE16-LABEL: multi_use_mul_mad_i16_var: +; GFX1250-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX1250-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-FAKE16-NEXT: v_mad_u16 v2, v0, v1, v2 +; GFX1250-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v3 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX1250-GISEL-FAKE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX1250-GISEL-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-SDAG-REAL16-LABEL: multi_use_mul_mad_i16_var: +; GFX1250-SDAG-REAL16: ; %bb.0: ; %entry +; GFX1250-SDAG-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-REAL16-NEXT: v_mov_b16_e32 v0.h, v0.l +; GFX1250-SDAG-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-REAL16-NEXT: v_mad_u16 v0.l, v0.h, v1.l, v2.l +; GFX1250-SDAG-REAL16-NEXT: v_mad_u16 v0.h, v0.h, v1.l, v3.l +; GFX1250-SDAG-REAL16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-REAL16-LABEL: multi_use_mul_mad_i16_var: +; GFX1250-GISEL-REAL16: ; %bb.0: ; %entry +; GFX1250-GISEL-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-REAL16-NEXT: v_mad_u16 v2.l, v0.l, v1.l, v2.l +; GFX1250-GISEL-REAL16-NEXT: v_mad_u16 v2.h, v0.l, v1.l, v3.l +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-REAL16-NEXT: v_mov_b32_e32 v0, v2 +; GFX1250-GISEL-REAL16-NEXT: s_set_pc_i64 s[30:31] ; GFX1250-GISEL-LABEL: multi_use_mul_mad_i16_var: ; GFX1250-GISEL: ; %bb.0: ; %entry ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -10835,15 +12026,25 @@ define i16 @other_use_mul_mad_i16_var(i16 %x, i16 %y, i16 %z, ptr addrspace(3) % ; GFX1200-GISEL-FAKE16-NEXT: s_wait_dscnt 0x0 ; GFX1200-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX1250-LABEL: other_use_mul_mad_i16_var: -; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mul_lo_u16 v4, v0, v1 -; GFX1250-NEXT: v_mad_u16 v0, v0, v1, v2 -; GFX1250-NEXT: ds_store_b16 v3, v4 -; GFX1250-NEXT: s_wait_dscnt 0x0 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-FAKE16-LABEL: other_use_mul_mad_i16_var: +; GFX1250-FAKE16: ; %bb.0: ; %entry +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_mul_lo_u16 v4, v0, v1 +; GFX1250-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v2 +; GFX1250-FAKE16-NEXT: ds_store_b16 v3, v4 +; GFX1250-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-REAL16-LABEL: other_use_mul_mad_i16_var: +; GFX1250-REAL16: ; %bb.0: ; %entry +; GFX1250-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: v_mul_lo_u16 v0.h, v0.l, v1.l +; GFX1250-REAL16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, v2.l +; GFX1250-REAL16-NEXT: ds_store_b16_d16_hi v3, v0 +; GFX1250-REAL16-NEXT: s_wait_dscnt 0x0 +; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31] entry: %mul = mul i16 %x, %y %add0 = add i16 %mul, %z diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cos.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cos.bf16.ll index 091859f3c9bf3..b75531748758b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cos.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cos.bf16.ll @@ -1,29 +1,83 @@ -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN %s -; xUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefix=GCN %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GCN,FAKE16 %s +; xUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GCN,FAKE16 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GCN,REAL16 %s +; xUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GCN,REAL16 %s ; FIXME: GlobalISel does not work with bf16 declare bfloat @llvm.amdgcn.cos.bf16(bfloat) #0 -; GCN-LABEL: {{^}}cos_bf16: -; GCN: v_cos_bf16_e32 {{v[0-9]+}}, {{s[0-9]+}} define amdgpu_kernel void @cos_bf16(ptr addrspace(1) %out, bfloat %src) #1 { +; FAKE16-LABEL: cos_bf16: +; FAKE16: ; %bb.0: +; FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; FAKE16-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv +; FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; FAKE16-NEXT: s_wait_kmcnt 0x0 +; FAKE16-NEXT: v_cos_bf16_e32 v0, s2 +; FAKE16-NEXT: global_store_b16 v1, v0, s[0:1] +; FAKE16-NEXT: s_endpgm +; +; REAL16-LABEL: cos_bf16: +; REAL16: ; %bb.0: +; REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; REAL16-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv +; REAL16-NEXT: v_mov_b32_e32 v1, 0 +; REAL16-NEXT: s_wait_kmcnt 0x0 +; REAL16-NEXT: v_cos_bf16_e32 v0.l, s2 +; REAL16-NEXT: global_store_b16 v1, v0, s[0:1] +; REAL16-NEXT: s_endpgm %cos = call bfloat @llvm.amdgcn.cos.bf16(bfloat %src) #0 store bfloat %cos, ptr addrspace(1) %out, align 2 ret void } -; GCN-LABEL: {{^}}cos_bf16_constant_4 -; GCN: v_cos_bf16_e32 v0, 4.0 define amdgpu_kernel void @cos_bf16_constant_4(ptr addrspace(1) %out) #1 { +; FAKE16-LABEL: cos_bf16_constant_4: +; FAKE16: ; %bb.0: +; FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; FAKE16-NEXT: v_cos_bf16_e32 v0, 4.0 +; FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; FAKE16-NEXT: s_wait_kmcnt 0x0 +; FAKE16-NEXT: global_store_b16 v1, v0, s[0:1] +; FAKE16-NEXT: s_endpgm +; +; REAL16-LABEL: cos_bf16_constant_4: +; REAL16: ; %bb.0: +; REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; REAL16-NEXT: v_cos_bf16_e32 v0.l, 4.0 +; REAL16-NEXT: v_mov_b32_e32 v1, 0 +; REAL16-NEXT: s_wait_kmcnt 0x0 +; REAL16-NEXT: global_store_b16 v1, v0, s[0:1] +; REAL16-NEXT: s_endpgm %cos = call bfloat @llvm.amdgcn.cos.bf16(bfloat 4.0) #0 store bfloat %cos, ptr addrspace(1) %out, align 2 ret void } -; GCN-LABEL: {{^}}cos_bf16_constant_100 -; GCN: v_cos_bf16_e32 {{v[0-9]+}}, 0x42c8 define amdgpu_kernel void @cos_bf16_constant_100(ptr addrspace(1) %out) #1 { +; FAKE16-LABEL: cos_bf16_constant_100: +; FAKE16: ; %bb.0: +; FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; FAKE16-NEXT: v_cos_bf16_e32 v0, 0x42c8 +; FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; FAKE16-NEXT: s_wait_kmcnt 0x0 +; FAKE16-NEXT: global_store_b16 v1, v0, s[0:1] +; FAKE16-NEXT: s_endpgm +; +; REAL16-LABEL: cos_bf16_constant_100: +; REAL16: ; %bb.0: +; REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; REAL16-NEXT: v_cos_bf16_e32 v0.l, 0x42c8 +; REAL16-NEXT: v_mov_b32_e32 v1, 0 +; REAL16-NEXT: s_wait_kmcnt 0x0 +; REAL16-NEXT: global_store_b16 v1, v0, s[0:1] +; REAL16-NEXT: s_endpgm %cos = call bfloat @llvm.amdgcn.cos.bf16(bfloat 100.0) #0 store bfloat %cos, ptr addrspace(1) %out, align 2 ret void @@ -31,3 +85,5 @@ define amdgpu_kernel void @cos_bf16_constant_100(ptr addrspace(1) %out) #1 { attributes #0 = { nounwind readnone } attributes #1 = { nounwind } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GCN: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll index 4d853e914cdda..241baf968491b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll @@ -1,7 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn -mcpu=gfx1170 < %s | FileCheck -check-prefixes=GFX1170PLUS,GFX1170 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX1170PLUS,GFX12 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1170PLUS,GFX1250 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX1170PLUS,GFX1250,GFX1250-FAKE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX1170PLUS,GFX1250,GFX1250-REAL16 %s define amdgpu_cs float @test_cvt_f32_bf8_byte0(i32 %a) { ; GFX1170-LABEL: test_cvt_f32_bf8_byte0: @@ -104,13 +105,23 @@ define amdgpu_cs void @test_cvt_pk_bf8_f32_word0(i32 %a, float %y, i32 %old, ptr ; GFX12-NEXT: global_store_b32 v[3:4], v2, off ; GFX12-NEXT: s_endpgm ; -; GFX1250-LABEL: test_cvt_pk_bf8_f32_word0: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 -; GFX1250-NEXT: v_cvt_pk_bf8_f32_e64_dpp v2, v0, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1250-NEXT: global_store_b32 v[4:5], v2, off -; GFX1250-NEXT: s_endpgm +; GFX1250-FAKE16-LABEL: test_cvt_pk_bf8_f32_word0: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-FAKE16-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 +; GFX1250-FAKE16-NEXT: v_cvt_pk_bf8_f32_e64_dpp v2, v0, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1250-FAKE16-NEXT: global_store_b32 v[4:5], v2, off +; GFX1250-FAKE16-NEXT: s_endpgm +; +; GFX1250-REAL16-LABEL: test_cvt_pk_bf8_f32_word0: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-REAL16-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1250-REAL16-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 +; GFX1250-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-REAL16-NEXT: v_cvt_pk_bf8_f32 v2.l, v0, v1 +; GFX1250-REAL16-NEXT: global_store_b32 v[4:5], v2, off +; GFX1250-REAL16-NEXT: s_endpgm %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1) %tmp1 = bitcast i32 %tmp0 to float %ret = tail call i32 @llvm.amdgcn.cvt.pk.bf8.f32(float %tmp1, float %y, i32 %old, i1 false) @@ -135,15 +146,25 @@ define amdgpu_cs void @test_cvt_pk_fp8_f32_word1(i32 %a, float %y, i32 %old, ptr ; GFX12-NEXT: global_store_b32 v[3:4], v2, off ; GFX12-NEXT: s_endpgm ; -; GFX1250-LABEL: test_cvt_pk_fp8_f32_word1: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1250-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1250-NEXT: v_cvt_pk_fp8_f32 v2, v0, v1 op_sel:[0,0,1] -; GFX1250-NEXT: global_store_b32 v[4:5], v2, off -; GFX1250-NEXT: s_endpgm +; GFX1250-FAKE16-LABEL: test_cvt_pk_fp8_f32_word1: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-FAKE16-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1250-FAKE16-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-FAKE16-NEXT: v_cvt_pk_fp8_f32 v2, v0, v1 op_sel:[0,0,1] +; GFX1250-FAKE16-NEXT: global_store_b32 v[4:5], v2, off +; GFX1250-FAKE16-NEXT: s_endpgm +; +; GFX1250-REAL16-LABEL: test_cvt_pk_fp8_f32_word1: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-REAL16-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1250-REAL16-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 +; GFX1250-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-REAL16-NEXT: v_cvt_pk_fp8_f32 v2.h, v0, v1 +; GFX1250-REAL16-NEXT: global_store_b32 v[4:5], v2, off +; GFX1250-REAL16-NEXT: s_endpgm %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1) %tmp1 = bitcast i32 %tmp0 to float %ret = tail call i32 @llvm.amdgcn.cvt.pk.fp8.f32(float %tmp1, float %y, i32 %old, i1 true) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.bf16.ll index 6304923790ad5..a85676a7ac7df 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.bf16.ll @@ -1,29 +1,83 @@ -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN %s -; xUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefix=GCN %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GCN,FAKE16 %s +; xUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GCN,FAKE16 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GCN,REAL16 %s +; xUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GCN,REAL16 %s ; FIXME: GlobalISel does not work with bf16 declare bfloat @llvm.amdgcn.exp2.bf16(bfloat) #0 -; GCN-LABEL: {{^}}exp_bf16: -; GCN: v_exp_bf16_e32 {{v[0-9]+}}, {{s[0-9]+}} define amdgpu_kernel void @exp_bf16(ptr addrspace(1) %out, bfloat %src) #1 { +; FAKE16-LABEL: exp_bf16: +; FAKE16: ; %bb.0: +; FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; FAKE16-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv +; FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; FAKE16-NEXT: s_wait_kmcnt 0x0 +; FAKE16-NEXT: v_exp_bf16_e32 v0, s2 +; FAKE16-NEXT: global_store_b16 v1, v0, s[0:1] +; FAKE16-NEXT: s_endpgm +; +; REAL16-LABEL: exp_bf16: +; REAL16: ; %bb.0: +; REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; REAL16-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv +; REAL16-NEXT: v_mov_b32_e32 v1, 0 +; REAL16-NEXT: s_wait_kmcnt 0x0 +; REAL16-NEXT: v_exp_bf16_e32 v0.l, s2 +; REAL16-NEXT: global_store_b16 v1, v0, s[0:1] +; REAL16-NEXT: s_endpgm %exp = call bfloat @llvm.amdgcn.exp2.bf16(bfloat %src) #0 store bfloat %exp, ptr addrspace(1) %out, align 2 ret void } -; GCN-LABEL: {{^}}exp_bf16_constant_4 -; GCN: v_exp_bf16_e32 v0, 4.0 define amdgpu_kernel void @exp_bf16_constant_4(ptr addrspace(1) %out) #1 { +; FAKE16-LABEL: exp_bf16_constant_4: +; FAKE16: ; %bb.0: +; FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; FAKE16-NEXT: v_exp_bf16_e32 v0, 4.0 +; FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; FAKE16-NEXT: s_wait_kmcnt 0x0 +; FAKE16-NEXT: global_store_b16 v1, v0, s[0:1] +; FAKE16-NEXT: s_endpgm +; +; REAL16-LABEL: exp_bf16_constant_4: +; REAL16: ; %bb.0: +; REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; REAL16-NEXT: v_exp_bf16_e32 v0.l, 4.0 +; REAL16-NEXT: v_mov_b32_e32 v1, 0 +; REAL16-NEXT: s_wait_kmcnt 0x0 +; REAL16-NEXT: global_store_b16 v1, v0, s[0:1] +; REAL16-NEXT: s_endpgm %exp = call bfloat @llvm.amdgcn.exp2.bf16(bfloat 4.0) #0 store bfloat %exp, ptr addrspace(1) %out, align 2 ret void } -; GCN-LABEL: {{^}}exp_bf16_constant_100 -; GCN: v_exp_bf16_e32 {{v[0-9]+}}, 0x42c8 define amdgpu_kernel void @exp_bf16_constant_100(ptr addrspace(1) %out) #1 { +; FAKE16-LABEL: exp_bf16_constant_100: +; FAKE16: ; %bb.0: +; FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; FAKE16-NEXT: v_exp_bf16_e32 v0, 0x42c8 +; FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; FAKE16-NEXT: s_wait_kmcnt 0x0 +; FAKE16-NEXT: global_store_b16 v1, v0, s[0:1] +; FAKE16-NEXT: s_endpgm +; +; REAL16-LABEL: exp_bf16_constant_100: +; REAL16: ; %bb.0: +; REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; REAL16-NEXT: v_exp_bf16_e32 v0.l, 0x42c8 +; REAL16-NEXT: v_mov_b32_e32 v1, 0 +; REAL16-NEXT: s_wait_kmcnt 0x0 +; REAL16-NEXT: global_store_b16 v1, v0, s[0:1] +; REAL16-NEXT: s_endpgm %exp = call bfloat @llvm.amdgcn.exp2.bf16(bfloat 100.0) #0 store bfloat %exp, ptr addrspace(1) %out, align 2 ret void @@ -31,3 +85,5 @@ define amdgpu_kernel void @exp_bf16_constant_100(ptr addrspace(1) %out) #1 { attributes #0 = { nounwind readnone } attributes #1 = { nounwind } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GCN: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.log.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.log.bf16.ll index a8b2077f5a35b..ecdcaa9b3026d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.log.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.log.bf16.ll @@ -1,29 +1,83 @@ -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN %s -; xUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefix=GCN %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GCN,FAKE16 %s +; xUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GCN,FAKE16 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GCN,REAL16 %s +; xUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GCN,REAL16 %s ; FIXME: GlobalISel does not work with bf16 declare bfloat @llvm.amdgcn.log.bf16(bfloat) #0 -; GCN-LABEL: {{^}}log_bf16: -; GCN: v_log_bf16_e32 {{v[0-9]+}}, {{s[0-9]+}} define amdgpu_kernel void @log_bf16(ptr addrspace(1) %out, bfloat %src) #1 { +; FAKE16-LABEL: log_bf16: +; FAKE16: ; %bb.0: +; FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; FAKE16-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv +; FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; FAKE16-NEXT: s_wait_kmcnt 0x0 +; FAKE16-NEXT: v_log_bf16_e32 v0, s2 +; FAKE16-NEXT: global_store_b16 v1, v0, s[0:1] +; FAKE16-NEXT: s_endpgm +; +; REAL16-LABEL: log_bf16: +; REAL16: ; %bb.0: +; REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; REAL16-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv +; REAL16-NEXT: v_mov_b32_e32 v1, 0 +; REAL16-NEXT: s_wait_kmcnt 0x0 +; REAL16-NEXT: v_log_bf16_e32 v0.l, s2 +; REAL16-NEXT: global_store_b16 v1, v0, s[0:1] +; REAL16-NEXT: s_endpgm %log = call bfloat @llvm.amdgcn.log.bf16(bfloat %src) #0 store bfloat %log, ptr addrspace(1) %out, align 2 ret void } -; GCN-LABEL: {{^}}log_bf16_constant_4 -; GCN: v_log_bf16_e32 v0, 4.0 define amdgpu_kernel void @log_bf16_constant_4(ptr addrspace(1) %out) #1 { +; FAKE16-LABEL: log_bf16_constant_4: +; FAKE16: ; %bb.0: +; FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; FAKE16-NEXT: v_log_bf16_e32 v0, 4.0 +; FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; FAKE16-NEXT: s_wait_kmcnt 0x0 +; FAKE16-NEXT: global_store_b16 v1, v0, s[0:1] +; FAKE16-NEXT: s_endpgm +; +; REAL16-LABEL: log_bf16_constant_4: +; REAL16: ; %bb.0: +; REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; REAL16-NEXT: v_log_bf16_e32 v0.l, 4.0 +; REAL16-NEXT: v_mov_b32_e32 v1, 0 +; REAL16-NEXT: s_wait_kmcnt 0x0 +; REAL16-NEXT: global_store_b16 v1, v0, s[0:1] +; REAL16-NEXT: s_endpgm %log = call bfloat @llvm.amdgcn.log.bf16(bfloat 4.0) #0 store bfloat %log, ptr addrspace(1) %out, align 2 ret void } -; GCN-LABEL: {{^}}log_bf16_constant_100 -; GCN: v_log_bf16_e32 {{v[0-9]+}}, 0x42c8 define amdgpu_kernel void @log_bf16_constant_100(ptr addrspace(1) %out) #1 { +; FAKE16-LABEL: log_bf16_constant_100: +; FAKE16: ; %bb.0: +; FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; FAKE16-NEXT: v_log_bf16_e32 v0, 0x42c8 +; FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; FAKE16-NEXT: s_wait_kmcnt 0x0 +; FAKE16-NEXT: global_store_b16 v1, v0, s[0:1] +; FAKE16-NEXT: s_endpgm +; +; REAL16-LABEL: log_bf16_constant_100: +; REAL16: ; %bb.0: +; REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; REAL16-NEXT: v_log_bf16_e32 v0.l, 0x42c8 +; REAL16-NEXT: v_mov_b32_e32 v1, 0 +; REAL16-NEXT: s_wait_kmcnt 0x0 +; REAL16-NEXT: global_store_b16 v1, v0, s[0:1] +; REAL16-NEXT: s_endpgm %log = call bfloat @llvm.amdgcn.log.bf16(bfloat 100.0) #0 store bfloat %log, ptr addrspace(1) %out, align 2 ret void @@ -31,3 +85,5 @@ define amdgpu_kernel void @log_bf16_constant_100(ptr addrspace(1) %out) #1 { attributes #0 = { nounwind readnone } attributes #1 = { nounwind } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GCN: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.make.buffer.rsrc.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.make.buffer.rsrc.ll index 239afdd389791..daa0df045f72e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.make.buffer.rsrc.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.make.buffer.rsrc.ll @@ -1,8 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -stop-after=amdgpu-isel < %s | FileCheck %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -stop-after=amdgpu-isel < %s | FileCheck --check-prefix=CHECK45 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck --check-prefix=CHECK45 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -stop-after=amdgpu-isel < %s | FileCheck --check-prefixes=CHECK45,FAKE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck --check-prefixes=CHECK45,FAKE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -stop-after=amdgpu-isel < %s | FileCheck --check-prefixes=CHECK45,REAL16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck --check-prefixes=CHECK45,REAL16 %s define amdgpu_ps ptr addrspace(8) @basic_raw_buffer(ptr inreg %p) { ; CHECK-LABEL: name: basic_raw_buffer @@ -396,58 +398,113 @@ define amdgpu_ps float @general_case_load_with_waterfall(ptr %p, i16 %stride, i6 ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_IDXEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG $vgpr0 ; - ; CHECK45-LABEL: name: general_case_load_with_waterfall - ; CHECK45: bb.0 (%ir-block.0): - ; CHECK45-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 - ; CHECK45-NEXT: {{ $}} - ; CHECK45-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr5 - ; CHECK45-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; CHECK45-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; CHECK45-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; CHECK45-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK45-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK45-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 - ; CHECK45-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; CHECK45-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 28 - ; CHECK45-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 killed [[S_MOV_B32_]], [[COPY]], implicit $exec - ; CHECK45-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; CHECK45-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, killed [[V_LSHLREV_B32_e64_]], %subreg.sub1 - ; CHECK45-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub1 - ; CHECK45-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 7 - ; CHECK45-NEXT: [[COPY7:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; CHECK45-NEXT: [[V_LSHRREV_B64_e64_:%[0-9]+]]:vreg_64_align2 = V_LSHRREV_B64_e64 killed [[S_MOV_B32_1]], [[COPY7]], implicit $exec - ; CHECK45-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[V_LSHRREV_B64_e64_]].sub1 - ; CHECK45-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY3]] - ; CHECK45-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 12 - ; CHECK45-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 killed [[S_MOV_B32_2]], killed [[COPY9]], implicit $exec - ; CHECK45-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, killed [[V_LSHLREV_B32_e64_1]], %subreg.sub1 - ; CHECK45-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE3]].sub1 - ; CHECK45-NEXT: [[V_OR3_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR3_B32_e64 killed [[COPY8]], killed [[COPY10]], killed [[COPY6]], implicit $exec - ; CHECK45-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0 - ; CHECK45-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[V_LSHRREV_B64_e64_]].sub0 - ; CHECK45-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE3]].sub0 - ; CHECK45-NEXT: [[V_OR3_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR3_B32_e64 killed [[COPY12]], killed [[COPY13]], killed [[COPY11]], implicit $exec - ; CHECK45-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_OR3_B32_e64_1]], %subreg.sub0, killed [[V_OR3_B32_e64_]], %subreg.sub1 - ; CHECK45-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE4]].sub1 - ; CHECK45-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE4]].sub0 - ; CHECK45-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 - ; CHECK45-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 - ; CHECK45-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 25 - ; CHECK45-NEXT: [[V_LSHLREV_B32_e64_2:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 killed [[S_MOV_B32_3]], killed [[COPY17]], implicit $exec - ; CHECK45-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, killed [[V_LSHLREV_B32_e64_2]], %subreg.sub1 - ; CHECK45-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE5]].sub1 - ; CHECK45-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 killed [[COPY16]], killed [[COPY18]], implicit $exec - ; CHECK45-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 - ; CHECK45-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE5]].sub0 - ; CHECK45-NEXT: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 killed [[COPY19]], killed [[COPY20]], implicit $exec - ; CHECK45-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_OR_B32_e64_1]], %subreg.sub0, killed [[V_OR_B32_e64_]], %subreg.sub1 - ; CHECK45-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE6]].sub1 - ; CHECK45-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE6]].sub0 - ; CHECK45-NEXT: [[REG_SEQUENCE7:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE killed [[COPY22]], %subreg.sub0, killed [[COPY21]], %subreg.sub1, killed [[COPY15]], %subreg.sub2, killed [[COPY14]], %subreg.sub3 - ; CHECK45-NEXT: [[COPY23:%[0-9]+]]:sgpr_128 = COPY [[REG_SEQUENCE7]] - ; CHECK45-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_IDXEN [[V_MOV_B32_e32_]], killed [[COPY23]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) - ; CHECK45-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_IDXEN]] - ; CHECK45-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + ; FAKE16-LABEL: name: general_case_load_with_waterfall + ; FAKE16: bb.0 (%ir-block.0): + ; FAKE16-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; FAKE16-NEXT: {{ $}} + ; FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; FAKE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; FAKE16-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; FAKE16-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; FAKE16-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; FAKE16-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; FAKE16-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 + ; FAKE16-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; FAKE16-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 28 + ; FAKE16-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 killed [[S_MOV_B32_]], [[COPY]], implicit $exec + ; FAKE16-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; FAKE16-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, killed [[V_LSHLREV_B32_e64_]], %subreg.sub1 + ; FAKE16-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub1 + ; FAKE16-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 7 + ; FAKE16-NEXT: [[COPY7:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; FAKE16-NEXT: [[V_LSHRREV_B64_e64_:%[0-9]+]]:vreg_64_align2 = V_LSHRREV_B64_e64 killed [[S_MOV_B32_1]], [[COPY7]], implicit $exec + ; FAKE16-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[V_LSHRREV_B64_e64_]].sub1 + ; FAKE16-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY3]] + ; FAKE16-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 12 + ; FAKE16-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 killed [[S_MOV_B32_2]], killed [[COPY9]], implicit $exec + ; FAKE16-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, killed [[V_LSHLREV_B32_e64_1]], %subreg.sub1 + ; FAKE16-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE3]].sub1 + ; FAKE16-NEXT: [[V_OR3_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR3_B32_e64 killed [[COPY8]], killed [[COPY10]], killed [[COPY6]], implicit $exec + ; FAKE16-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0 + ; FAKE16-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[V_LSHRREV_B64_e64_]].sub0 + ; FAKE16-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE3]].sub0 + ; FAKE16-NEXT: [[V_OR3_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR3_B32_e64 killed [[COPY12]], killed [[COPY13]], killed [[COPY11]], implicit $exec + ; FAKE16-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_OR3_B32_e64_1]], %subreg.sub0, killed [[V_OR3_B32_e64_]], %subreg.sub1 + ; FAKE16-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE4]].sub1 + ; FAKE16-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE4]].sub0 + ; FAKE16-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; FAKE16-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 + ; FAKE16-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 25 + ; FAKE16-NEXT: [[V_LSHLREV_B32_e64_2:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 killed [[S_MOV_B32_3]], killed [[COPY17]], implicit $exec + ; FAKE16-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, killed [[V_LSHLREV_B32_e64_2]], %subreg.sub1 + ; FAKE16-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE5]].sub1 + ; FAKE16-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 killed [[COPY16]], killed [[COPY18]], implicit $exec + ; FAKE16-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; FAKE16-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE5]].sub0 + ; FAKE16-NEXT: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 killed [[COPY19]], killed [[COPY20]], implicit $exec + ; FAKE16-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_OR_B32_e64_1]], %subreg.sub0, killed [[V_OR_B32_e64_]], %subreg.sub1 + ; FAKE16-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE6]].sub1 + ; FAKE16-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE6]].sub0 + ; FAKE16-NEXT: [[REG_SEQUENCE7:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE killed [[COPY22]], %subreg.sub0, killed [[COPY21]], %subreg.sub1, killed [[COPY15]], %subreg.sub2, killed [[COPY14]], %subreg.sub3 + ; FAKE16-NEXT: [[COPY23:%[0-9]+]]:sgpr_128 = COPY [[REG_SEQUENCE7]] + ; FAKE16-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_IDXEN [[V_MOV_B32_e32_]], killed [[COPY23]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) + ; FAKE16-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_IDXEN]] + ; FAKE16-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + ; + ; REAL16-LABEL: name: general_case_load_with_waterfall + ; REAL16: bb.0 (%ir-block.0): + ; REAL16-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; REAL16-NEXT: {{ $}} + ; REAL16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; REAL16-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; REAL16-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; REAL16-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; REAL16-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; REAL16-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; REAL16-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 + ; REAL16-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; REAL16-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 28 + ; REAL16-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 killed [[S_MOV_B32_]], [[COPY]], implicit $exec + ; REAL16-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; REAL16-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, killed [[V_LSHLREV_B32_e64_]], %subreg.sub1 + ; REAL16-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub1 + ; REAL16-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 7 + ; REAL16-NEXT: [[COPY7:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; REAL16-NEXT: [[V_LSHRREV_B64_e64_:%[0-9]+]]:vreg_64_align2 = V_LSHRREV_B64_e64 killed [[S_MOV_B32_1]], [[COPY7]], implicit $exec + ; REAL16-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[V_LSHRREV_B64_e64_]].sub1 + ; REAL16-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; REAL16-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; REAL16-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[COPY3]], %subreg.lo16, killed [[DEF]], %subreg.hi16 + ; REAL16-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 12 + ; REAL16-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 killed [[S_MOV_B32_2]], killed [[REG_SEQUENCE3]], implicit $exec + ; REAL16-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, killed [[V_LSHLREV_B32_e64_1]], %subreg.sub1 + ; REAL16-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE4]].sub1 + ; REAL16-NEXT: [[V_OR3_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR3_B32_e64 killed [[COPY8]], killed [[COPY9]], killed [[COPY6]], implicit $exec + ; REAL16-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0 + ; REAL16-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[V_LSHRREV_B64_e64_]].sub0 + ; REAL16-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE4]].sub0 + ; REAL16-NEXT: [[V_OR3_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR3_B32_e64 killed [[COPY11]], killed [[COPY12]], killed [[COPY10]], implicit $exec + ; REAL16-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_OR3_B32_e64_1]], %subreg.sub0, killed [[V_OR3_B32_e64_]], %subreg.sub1 + ; REAL16-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE5]].sub1 + ; REAL16-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE5]].sub0 + ; REAL16-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; REAL16-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 + ; REAL16-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 25 + ; REAL16-NEXT: [[V_LSHLREV_B32_e64_2:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 killed [[S_MOV_B32_3]], killed [[COPY16]], implicit $exec + ; REAL16-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, killed [[V_LSHLREV_B32_e64_2]], %subreg.sub1 + ; REAL16-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE6]].sub1 + ; REAL16-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 killed [[COPY15]], killed [[COPY17]], implicit $exec + ; REAL16-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; REAL16-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE6]].sub0 + ; REAL16-NEXT: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 killed [[COPY18]], killed [[COPY19]], implicit $exec + ; REAL16-NEXT: [[REG_SEQUENCE7:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_OR_B32_e64_1]], %subreg.sub0, killed [[V_OR_B32_e64_]], %subreg.sub1 + ; REAL16-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE7]].sub1 + ; REAL16-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE7]].sub0 + ; REAL16-NEXT: [[REG_SEQUENCE8:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE killed [[COPY21]], %subreg.sub0, killed [[COPY20]], %subreg.sub1, killed [[COPY14]], %subreg.sub2, killed [[COPY13]], %subreg.sub3 + ; REAL16-NEXT: [[COPY22:%[0-9]+]]:sgpr_128 = COPY [[REG_SEQUENCE8]] + ; REAL16-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_IDXEN [[V_MOV_B32_e32_]], killed [[COPY22]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) + ; REAL16-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_IDXEN]] + ; REAL16-NEXT: SI_RETURN_TO_EPILOG $vgpr0 %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %p, i16 %stride, i64 %numVals, i32 %flags) %value = call float @llvm.amdgcn.struct.ptr.buffer.load(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) ret float %value diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sin.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sin.bf16.ll index 9c35a7eae0b8e..a5a2799789dc0 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sin.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sin.bf16.ll @@ -1,29 +1,83 @@ -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN %s -; xUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefix=GCN %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GCN,FAKE16 %s +; xUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GCN,FAKE16 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GCN,REAL16 %s +; xUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GCN,REAL16 %s ; FIXME: GlobalISel does not work with bf16 declare bfloat @llvm.amdgcn.sin.bf16(bfloat) #0 -; GCN-LABEL: {{^}}sin_bf16: -; GCN: v_sin_bf16_e32 {{v[0-9]+}}, {{s[0-9]+}} define amdgpu_kernel void @sin_bf16(ptr addrspace(1) %out, bfloat %src) #1 { +; FAKE16-LABEL: sin_bf16: +; FAKE16: ; %bb.0: +; FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; FAKE16-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv +; FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; FAKE16-NEXT: s_wait_kmcnt 0x0 +; FAKE16-NEXT: v_sin_bf16_e32 v0, s2 +; FAKE16-NEXT: global_store_b16 v1, v0, s[0:1] +; FAKE16-NEXT: s_endpgm +; +; REAL16-LABEL: sin_bf16: +; REAL16: ; %bb.0: +; REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; REAL16-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv +; REAL16-NEXT: v_mov_b32_e32 v1, 0 +; REAL16-NEXT: s_wait_kmcnt 0x0 +; REAL16-NEXT: v_sin_bf16_e32 v0.l, s2 +; REAL16-NEXT: global_store_b16 v1, v0, s[0:1] +; REAL16-NEXT: s_endpgm %sin = call bfloat @llvm.amdgcn.sin.bf16(bfloat %src) #0 store bfloat %sin, ptr addrspace(1) %out, align 2 ret void } -; GCN-LABEL: {{^}}sin_bf16_constant_4 -; GCN: v_sin_bf16_e32 v0, 4.0 define amdgpu_kernel void @sin_bf16_constant_4(ptr addrspace(1) %out) #1 { +; FAKE16-LABEL: sin_bf16_constant_4: +; FAKE16: ; %bb.0: +; FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; FAKE16-NEXT: v_sin_bf16_e32 v0, 4.0 +; FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; FAKE16-NEXT: s_wait_kmcnt 0x0 +; FAKE16-NEXT: global_store_b16 v1, v0, s[0:1] +; FAKE16-NEXT: s_endpgm +; +; REAL16-LABEL: sin_bf16_constant_4: +; REAL16: ; %bb.0: +; REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; REAL16-NEXT: v_sin_bf16_e32 v0.l, 4.0 +; REAL16-NEXT: v_mov_b32_e32 v1, 0 +; REAL16-NEXT: s_wait_kmcnt 0x0 +; REAL16-NEXT: global_store_b16 v1, v0, s[0:1] +; REAL16-NEXT: s_endpgm %sin = call bfloat @llvm.amdgcn.sin.bf16(bfloat 4.0) #0 store bfloat %sin, ptr addrspace(1) %out, align 2 ret void } -; GCN-LABEL: {{^}}sin_bf16_constant_100 -; GCN: v_sin_bf16_e32 {{v[0-9]+}}, 0x42c8 define amdgpu_kernel void @sin_bf16_constant_100(ptr addrspace(1) %out) #1 { +; FAKE16-LABEL: sin_bf16_constant_100: +; FAKE16: ; %bb.0: +; FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; FAKE16-NEXT: v_sin_bf16_e32 v0, 0x42c8 +; FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; FAKE16-NEXT: s_wait_kmcnt 0x0 +; FAKE16-NEXT: global_store_b16 v1, v0, s[0:1] +; FAKE16-NEXT: s_endpgm +; +; REAL16-LABEL: sin_bf16_constant_100: +; REAL16: ; %bb.0: +; REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; REAL16-NEXT: v_sin_bf16_e32 v0.l, 0x42c8 +; REAL16-NEXT: v_mov_b32_e32 v1, 0 +; REAL16-NEXT: s_wait_kmcnt 0x0 +; REAL16-NEXT: global_store_b16 v1, v0, s[0:1] +; REAL16-NEXT: s_endpgm %sin = call bfloat @llvm.amdgcn.sin.bf16(bfloat 100.0) #0 store bfloat %sin, ptr addrspace(1) %out, align 2 ret void @@ -31,3 +85,5 @@ define amdgpu_kernel void @sin_bf16_constant_100(ptr addrspace(1) %out) #1 { attributes #0 = { nounwind readnone } attributes #1 = { nounwind } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GCN: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sqrt.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sqrt.bf16.ll index 5287b5dba848f..04bda9c8595ca 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sqrt.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sqrt.bf16.ll @@ -1,29 +1,83 @@ -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN %s -; xUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefix=GCN %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GCN,FAKE16 %s +; xUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GCN,FAKE16 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GCN,REAL16 %s +; xUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GCN,REAL16 %s ; FIXME: GlobalISel does not work with bf16 declare bfloat @llvm.amdgcn.sqrt.bf16(bfloat) #0 -; GCN-LABEL: {{^}}sqrt_bf16: -; GCN: v_sqrt_bf16_e32 {{v[0-9]+}}, {{s[0-9]+}} define amdgpu_kernel void @sqrt_bf16(ptr addrspace(1) %out, bfloat %src) #1 { +; FAKE16-LABEL: sqrt_bf16: +; FAKE16: ; %bb.0: +; FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; FAKE16-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv +; FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; FAKE16-NEXT: s_wait_kmcnt 0x0 +; FAKE16-NEXT: v_sqrt_bf16_e32 v0, s2 +; FAKE16-NEXT: global_store_b16 v1, v0, s[0:1] +; FAKE16-NEXT: s_endpgm +; +; REAL16-LABEL: sqrt_bf16: +; REAL16: ; %bb.0: +; REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; REAL16-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv +; REAL16-NEXT: v_mov_b32_e32 v1, 0 +; REAL16-NEXT: s_wait_kmcnt 0x0 +; REAL16-NEXT: v_sqrt_bf16_e32 v0.l, s2 +; REAL16-NEXT: global_store_b16 v1, v0, s[0:1] +; REAL16-NEXT: s_endpgm %sqrt = call bfloat @llvm.amdgcn.sqrt.bf16(bfloat %src) #0 store bfloat %sqrt, ptr addrspace(1) %out, align 2 ret void } -; GCN-LABEL: {{^}}sqrt_bf16_constant_4 -; GCN: v_sqrt_bf16_e32 v0, 4.0 define amdgpu_kernel void @sqrt_bf16_constant_4(ptr addrspace(1) %out) #1 { +; FAKE16-LABEL: sqrt_bf16_constant_4: +; FAKE16: ; %bb.0: +; FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; FAKE16-NEXT: v_sqrt_bf16_e32 v0, 4.0 +; FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; FAKE16-NEXT: s_wait_kmcnt 0x0 +; FAKE16-NEXT: global_store_b16 v1, v0, s[0:1] +; FAKE16-NEXT: s_endpgm +; +; REAL16-LABEL: sqrt_bf16_constant_4: +; REAL16: ; %bb.0: +; REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; REAL16-NEXT: v_sqrt_bf16_e32 v0.l, 4.0 +; REAL16-NEXT: v_mov_b32_e32 v1, 0 +; REAL16-NEXT: s_wait_kmcnt 0x0 +; REAL16-NEXT: global_store_b16 v1, v0, s[0:1] +; REAL16-NEXT: s_endpgm %sqrt = call bfloat @llvm.amdgcn.sqrt.bf16(bfloat 4.0) #0 store bfloat %sqrt, ptr addrspace(1) %out, align 2 ret void } -; GCN-LABEL: {{^}}sqrt_bf16_constant_100 -; GCN: v_sqrt_bf16_e32 {{v[0-9]+}}, 0x42c8 define amdgpu_kernel void @sqrt_bf16_constant_100(ptr addrspace(1) %out) #1 { +; FAKE16-LABEL: sqrt_bf16_constant_100: +; FAKE16: ; %bb.0: +; FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; FAKE16-NEXT: v_sqrt_bf16_e32 v0, 0x42c8 +; FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; FAKE16-NEXT: s_wait_kmcnt 0x0 +; FAKE16-NEXT: global_store_b16 v1, v0, s[0:1] +; FAKE16-NEXT: s_endpgm +; +; REAL16-LABEL: sqrt_bf16_constant_100: +; REAL16: ; %bb.0: +; REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; REAL16-NEXT: v_sqrt_bf16_e32 v0.l, 0x42c8 +; REAL16-NEXT: v_mov_b32_e32 v1, 0 +; REAL16-NEXT: s_wait_kmcnt 0x0 +; REAL16-NEXT: global_store_b16 v1, v0, s[0:1] +; REAL16-NEXT: s_endpgm %sqrt = call bfloat @llvm.amdgcn.sqrt.bf16(bfloat 100.0) #0 store bfloat %sqrt, ptr addrspace(1) %out, align 2 ret void @@ -31,3 +85,5 @@ define amdgpu_kernel void @sqrt_bf16_constant_100(ptr addrspace(1) %out) #1 { attributes #0 = { nounwind readnone } attributes #1 = { nounwind } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GCN: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll index 7cd0cd16e9396..82d8ff8e3b874 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 -; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck %s --check-prefix=GFX1250 -; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -global-isel -global-isel-abort=2 < %s | FileCheck %s --check-prefix=GISEL +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck %s --check-prefixes=GFX1250,GFX1250-FAKE16 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -global-isel -global-isel-abort=2 < %s | FileCheck %s --check-prefixes=GISEL,GISEL-FAKE16 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck %s --check-prefixes=GFX1250,GFX1250-REAL16 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -global-isel -global-isel-abort=2 < %s | FileCheck %s --check-prefixes=GISEL,GISEL-REAL16 define amdgpu_ps void @test_wmma_f32_16x16x4_f32(<2 x float> %A, <2 x float> %B, <8 x float> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f32_16x16x4_f32: @@ -2789,15 +2791,15 @@ bb: } define amdgpu_ps void @test_swmmac_f16_16x16x128_fp8_fp8(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) { -; GFX1250-LABEL: test_swmmac_f16_16x16x128_fp8_fp8: -; GFX1250: ; %bb.0: ; %bb -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: v_dual_mov_b32 v31, v30 :: v_dual_mov_b32 v30, v29 -; GFX1250-NEXT: v_mov_b32_e32 v29, 0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_swmmac_f16_16x16x128_fp8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_b_reuse -; GFX1250-NEXT: global_store_b128 v[30:31], v[24:27], off -; GFX1250-NEXT: s_endpgm +; GFX1250-FAKE16-LABEL: test_swmmac_f16_16x16x128_fp8_fp8: +; GFX1250-FAKE16: ; %bb.0: ; %bb +; GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-FAKE16-NEXT: v_dual_mov_b32 v31, v30 :: v_dual_mov_b32 v30, v29 +; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v29, 0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_swmmac_f16_16x16x128_fp8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_b_reuse +; GFX1250-FAKE16-NEXT: global_store_b128 v[30:31], v[24:27], off +; GFX1250-FAKE16-NEXT: s_endpgm ; ; GISEL-LABEL: test_swmmac_f16_16x16x128_fp8_fp8: ; GISEL: ; %bb.0: ; %bb @@ -2806,6 +2808,14 @@ define amdgpu_ps void @test_swmmac_f16_16x16x128_fp8_fp8(<8 x i32> %A, <16 x i32 ; GISEL-NEXT: v_dual_mov_b32 v32, v29 :: v_dual_mov_b32 v33, v30 ; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off ; GISEL-NEXT: s_endpgm +; +; GFX1250-REAL16-LABEL: test_swmmac_f16_16x16x128_fp8_fp8: +; GFX1250-REAL16: ; %bb.0: ; %bb +; GFX1250-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-REAL16-NEXT: v_swmmac_f16_16x16x128_fp8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_b_reuse +; GFX1250-REAL16-NEXT: v_dual_mov_b32 v31, v30 :: v_dual_mov_b32 v30, v29 +; GFX1250-REAL16-NEXT: global_store_b128 v[30:31], v[24:27], off +; GFX1250-REAL16-NEXT: s_endpgm bb: %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.fp8.fp8.v8f16.v8i32.v16i32.i16(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i16 %Index, i1 false, i1 true) store <8 x half> %res, ptr addrspace(1) %out @@ -2813,15 +2823,15 @@ bb: } define amdgpu_ps void @test_swmmac_f16_16x16x128_fp8_bf8(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) { -; GFX1250-LABEL: test_swmmac_f16_16x16x128_fp8_bf8: -; GFX1250: ; %bb.0: ; %bb -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: v_dual_mov_b32 v31, v30 :: v_dual_mov_b32 v30, v29 -; GFX1250-NEXT: v_mov_b32_e32 v29, 0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_swmmac_f16_16x16x128_fp8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_b_reuse -; GFX1250-NEXT: global_store_b128 v[30:31], v[24:27], off -; GFX1250-NEXT: s_endpgm +; GFX1250-FAKE16-LABEL: test_swmmac_f16_16x16x128_fp8_bf8: +; GFX1250-FAKE16: ; %bb.0: ; %bb +; GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-FAKE16-NEXT: v_dual_mov_b32 v31, v30 :: v_dual_mov_b32 v30, v29 +; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v29, 0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_swmmac_f16_16x16x128_fp8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_b_reuse +; GFX1250-FAKE16-NEXT: global_store_b128 v[30:31], v[24:27], off +; GFX1250-FAKE16-NEXT: s_endpgm ; ; GISEL-LABEL: test_swmmac_f16_16x16x128_fp8_bf8: ; GISEL: ; %bb.0: ; %bb @@ -2830,6 +2840,14 @@ define amdgpu_ps void @test_swmmac_f16_16x16x128_fp8_bf8(<8 x i32> %A, <16 x i32 ; GISEL-NEXT: v_dual_mov_b32 v32, v29 :: v_dual_mov_b32 v33, v30 ; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off ; GISEL-NEXT: s_endpgm +; +; GFX1250-REAL16-LABEL: test_swmmac_f16_16x16x128_fp8_bf8: +; GFX1250-REAL16: ; %bb.0: ; %bb +; GFX1250-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-REAL16-NEXT: v_swmmac_f16_16x16x128_fp8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_b_reuse +; GFX1250-REAL16-NEXT: v_dual_mov_b32 v31, v30 :: v_dual_mov_b32 v30, v29 +; GFX1250-REAL16-NEXT: global_store_b128 v[30:31], v[24:27], off +; GFX1250-REAL16-NEXT: s_endpgm bb: %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.fp8.bf8.v8f16.v8i32.v16i32.i16(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i16 %Index, i1 false, i1 true) store <8 x half> %res, ptr addrspace(1) %out @@ -2837,15 +2855,15 @@ bb: } define amdgpu_ps void @test_swmmac_f16_16x16x128_bf8_fp8(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) { -; GFX1250-LABEL: test_swmmac_f16_16x16x128_bf8_fp8: -; GFX1250: ; %bb.0: ; %bb -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: v_dual_mov_b32 v31, v30 :: v_dual_mov_b32 v30, v29 -; GFX1250-NEXT: v_mov_b32_e32 v29, 0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_swmmac_f16_16x16x128_bf8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_b_reuse -; GFX1250-NEXT: global_store_b128 v[30:31], v[24:27], off -; GFX1250-NEXT: s_endpgm +; GFX1250-FAKE16-LABEL: test_swmmac_f16_16x16x128_bf8_fp8: +; GFX1250-FAKE16: ; %bb.0: ; %bb +; GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-FAKE16-NEXT: v_dual_mov_b32 v31, v30 :: v_dual_mov_b32 v30, v29 +; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v29, 0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_swmmac_f16_16x16x128_bf8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_b_reuse +; GFX1250-FAKE16-NEXT: global_store_b128 v[30:31], v[24:27], off +; GFX1250-FAKE16-NEXT: s_endpgm ; ; GISEL-LABEL: test_swmmac_f16_16x16x128_bf8_fp8: ; GISEL: ; %bb.0: ; %bb @@ -2854,6 +2872,14 @@ define amdgpu_ps void @test_swmmac_f16_16x16x128_bf8_fp8(<8 x i32> %A, <16 x i32 ; GISEL-NEXT: v_dual_mov_b32 v32, v29 :: v_dual_mov_b32 v33, v30 ; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off ; GISEL-NEXT: s_endpgm +; +; GFX1250-REAL16-LABEL: test_swmmac_f16_16x16x128_bf8_fp8: +; GFX1250-REAL16: ; %bb.0: ; %bb +; GFX1250-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-REAL16-NEXT: v_swmmac_f16_16x16x128_bf8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_b_reuse +; GFX1250-REAL16-NEXT: v_dual_mov_b32 v31, v30 :: v_dual_mov_b32 v30, v29 +; GFX1250-REAL16-NEXT: global_store_b128 v[30:31], v[24:27], off +; GFX1250-REAL16-NEXT: s_endpgm bb: %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.bf8.fp8.v8f16.v8i32.v16i32.i16(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i16 %Index, i1 false, i1 true) store <8 x half> %res, ptr addrspace(1) %out @@ -2861,15 +2887,15 @@ bb: } define amdgpu_ps void @test_swmmac_f16_16x16x128_bf8_bf8(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) { -; GFX1250-LABEL: test_swmmac_f16_16x16x128_bf8_bf8: -; GFX1250: ; %bb.0: ; %bb -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: v_dual_mov_b32 v31, v30 :: v_dual_mov_b32 v30, v29 -; GFX1250-NEXT: v_mov_b32_e32 v29, 0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_b_reuse -; GFX1250-NEXT: global_store_b128 v[30:31], v[24:27], off -; GFX1250-NEXT: s_endpgm +; GFX1250-FAKE16-LABEL: test_swmmac_f16_16x16x128_bf8_bf8: +; GFX1250-FAKE16: ; %bb.0: ; %bb +; GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-FAKE16-NEXT: v_dual_mov_b32 v31, v30 :: v_dual_mov_b32 v30, v29 +; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v29, 0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_b_reuse +; GFX1250-FAKE16-NEXT: global_store_b128 v[30:31], v[24:27], off +; GFX1250-FAKE16-NEXT: s_endpgm ; ; GISEL-LABEL: test_swmmac_f16_16x16x128_bf8_bf8: ; GISEL: ; %bb.0: ; %bb @@ -2878,6 +2904,14 @@ define amdgpu_ps void @test_swmmac_f16_16x16x128_bf8_bf8(<8 x i32> %A, <16 x i32 ; GISEL-NEXT: v_dual_mov_b32 v32, v29 :: v_dual_mov_b32 v33, v30 ; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off ; GISEL-NEXT: s_endpgm +; +; GFX1250-REAL16-LABEL: test_swmmac_f16_16x16x128_bf8_bf8: +; GFX1250-REAL16: ; %bb.0: ; %bb +; GFX1250-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-REAL16-NEXT: v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_b_reuse +; GFX1250-REAL16-NEXT: v_dual_mov_b32 v31, v30 :: v_dual_mov_b32 v30, v29 +; GFX1250-REAL16-NEXT: global_store_b128 v[30:31], v[24:27], off +; GFX1250-REAL16-NEXT: s_endpgm bb: %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.bf8.bf8.v8f16.v8i32.v16i32.i16(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i16 %Index, i1 false, i1 true) store <8 x half> %res, ptr addrspace(1) %out @@ -3000,3 +3034,6 @@ declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.bf8.bf8.v8f16.v8i32.v16i32. declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x128.iu8.v8i32.v8i32.v16i32.i64(i1 immarg, <8 x i32>, i1 immarg, <16 x i32>, <8 x i32>, i64 %Index, i1, i1) declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.f16.v8f32.v16f16.v32f16.i16(i1, <16 x half>, i1, <32 x half>, <8 x float>, i16, i1, i1) declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x64.f16.v8f16.v16f16.v32f16.i16(i1, <16 x half>, i1, <32 x half>, <8 x half>, i16, i1, i1) +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GISEL-FAKE16: {{.*}} +; GISEL-REAL16: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.cos.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.cos.bf16.ll index 8498e64b40d9a..f32abce3cdd86 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.cos.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.cos.bf16.ll @@ -1,55 +1,91 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 %s -o - | FileCheck -check-prefixes=GCN %s -; xUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 %s -o - | FileCheck -check-prefix=GCN %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GCN,FAKE16 %s +; xUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GCN,FAKE16 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GCN,REAL16 %s +; xUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GCN,REAL16 %s ; FIXME: GlobalISel does not work with bf16 declare bfloat @llvm.cos.bf16(bfloat) #0 define amdgpu_kernel void @cos_bf16(ptr addrspace(1) %out, bfloat %src) #1 { -; GCN-LABEL: cos_bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 nv -; GCN-NEXT: s_mov_b32 s3, 0x3e230000 -; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-NEXT: v_fma_mixlo_bf16 v0, s2, s3, 0 op_sel_hi:[1,0,0] -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_cos_bf16_e32 v0, v0 -; GCN-NEXT: global_store_b16 v1, v0, s[0:1] -; GCN-NEXT: s_endpgm +; FAKE16-LABEL: cos_bf16: +; FAKE16: ; %bb.0: +; FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; FAKE16-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv +; FAKE16-NEXT: s_mov_b32 s3, 0x3e230000 +; FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; FAKE16-NEXT: s_wait_kmcnt 0x0 +; FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; msbs: dst=0 src0=0 src1=0 src2=0 +; FAKE16-NEXT: v_fma_mixlo_bf16 v0, s2, s3, 0 op_sel_hi:[1,0,0] +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; FAKE16-NEXT: v_cos_bf16_e32 v0, v0 +; FAKE16-NEXT: global_store_b16 v1, v0, s[0:1] +; FAKE16-NEXT: s_endpgm +; +; REAL16-LABEL: cos_bf16: +; REAL16: ; %bb.0: +; REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; REAL16-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv +; REAL16-NEXT: s_mov_b32 s3, 0x3e230000 +; REAL16-NEXT: v_mov_b32_e32 v1, 0 +; REAL16-NEXT: s_wait_kmcnt 0x0 +; REAL16-NEXT: v_fma_mix_f32_bf16 v0, s2, s3, neg(0) op_sel_hi:[1,0,0] +; REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; REAL16-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; REAL16-NEXT: v_cos_bf16_e32 v0.l, v0.l +; REAL16-NEXT: global_store_b16 v1, v0, s[0:1] +; REAL16-NEXT: s_endpgm %cos = call bfloat @llvm.cos.bf16(bfloat %src) #0 store bfloat %cos, ptr addrspace(1) %out, align 2 ret void } define amdgpu_kernel void @cos_bf16_constant_4(ptr addrspace(1) %out) #1 { -; GCN-LABEL: cos_bf16_constant_4: -; GCN: ; %bb.0: -; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv -; GCN-NEXT: v_cos_bf16_e32 v0, 0x3f23 -; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: global_store_b16 v1, v0, s[0:1] -; GCN-NEXT: s_endpgm +; FAKE16-LABEL: cos_bf16_constant_4: +; FAKE16: ; %bb.0: +; FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; FAKE16-NEXT: v_cos_bf16_e32 v0, 0x3f23 +; FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; FAKE16-NEXT: s_wait_kmcnt 0x0 +; FAKE16-NEXT: global_store_b16 v1, v0, s[0:1] +; FAKE16-NEXT: s_endpgm +; +; REAL16-LABEL: cos_bf16_constant_4: +; REAL16: ; %bb.0: +; REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; REAL16-NEXT: v_cos_bf16_e32 v0.l, 0x3f23 +; REAL16-NEXT: v_mov_b32_e32 v1, 0 +; REAL16-NEXT: s_wait_kmcnt 0x0 +; REAL16-NEXT: global_store_b16 v1, v0, s[0:1] +; REAL16-NEXT: s_endpgm %cos = call bfloat @llvm.cos.bf16(bfloat 4.0) #0 store bfloat %cos, ptr addrspace(1) %out, align 2 ret void } define amdgpu_kernel void @cos_bf16_constant_100(ptr addrspace(1) %out) #1 { -; GCN-LABEL: cos_bf16_constant_100: -; GCN: ; %bb.0: -; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv -; GCN-NEXT: v_cos_bf16_e32 v0, 0x417f -; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: global_store_b16 v1, v0, s[0:1] -; GCN-NEXT: s_endpgm +; FAKE16-LABEL: cos_bf16_constant_100: +; FAKE16: ; %bb.0: +; FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; FAKE16-NEXT: v_cos_bf16_e32 v0, 0x417f +; FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; FAKE16-NEXT: s_wait_kmcnt 0x0 +; FAKE16-NEXT: global_store_b16 v1, v0, s[0:1] +; FAKE16-NEXT: s_endpgm +; +; REAL16-LABEL: cos_bf16_constant_100: +; REAL16: ; %bb.0: +; REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; REAL16-NEXT: v_cos_bf16_e32 v0.l, 0x417f +; REAL16-NEXT: v_mov_b32_e32 v1, 0 +; REAL16-NEXT: s_wait_kmcnt 0x0 +; REAL16-NEXT: global_store_b16 v1, v0, s[0:1] +; REAL16-NEXT: s_endpgm %cos = call bfloat @llvm.cos.bf16(bfloat 100.0) #0 store bfloat %cos, ptr addrspace(1) %out, align 2 ret void @@ -57,3 +93,5 @@ define amdgpu_kernel void @cos_bf16_constant_100(ptr addrspace(1) %out) #1 { attributes #0 = { nounwind readnone } attributes #1 = { nounwind } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GCN: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sin.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sin.bf16.ll index 991ab5f6e38d0..e082b8da0d0f9 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.sin.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.sin.bf16.ll @@ -1,55 +1,91 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 %s -o - | FileCheck -check-prefixes=GCN %s -; xUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 %s -o - | FileCheck -check-prefix=GCN %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GCN,FAKE16 %s +; xUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GCN,FAKE16 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GCN,REAL16 %s +; xUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GCN,REAL16 %s ; FIXME: GlobalISel does not work with bf16 declare bfloat @llvm.sin.bf16(bfloat) #0 define amdgpu_kernel void @sin_bf16(ptr addrspace(1) %out, bfloat %src) #1 { -; GCN-LABEL: sin_bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 nv -; GCN-NEXT: s_mov_b32 s3, 0x3e230000 -; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-NEXT: v_fma_mixlo_bf16 v0, s2, s3, 0 op_sel_hi:[1,0,0] -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_sin_bf16_e32 v0, v0 -; GCN-NEXT: global_store_b16 v1, v0, s[0:1] -; GCN-NEXT: s_endpgm +; FAKE16-LABEL: sin_bf16: +; FAKE16: ; %bb.0: +; FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; FAKE16-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv +; FAKE16-NEXT: s_mov_b32 s3, 0x3e230000 +; FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; FAKE16-NEXT: s_wait_kmcnt 0x0 +; FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; msbs: dst=0 src0=0 src1=0 src2=0 +; FAKE16-NEXT: v_fma_mixlo_bf16 v0, s2, s3, 0 op_sel_hi:[1,0,0] +; FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; FAKE16-NEXT: v_sin_bf16_e32 v0, v0 +; FAKE16-NEXT: global_store_b16 v1, v0, s[0:1] +; FAKE16-NEXT: s_endpgm +; +; REAL16-LABEL: sin_bf16: +; REAL16: ; %bb.0: +; REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; REAL16-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv +; REAL16-NEXT: s_mov_b32 s3, 0x3e230000 +; REAL16-NEXT: v_mov_b32_e32 v1, 0 +; REAL16-NEXT: s_wait_kmcnt 0x0 +; REAL16-NEXT: v_fma_mix_f32_bf16 v0, s2, s3, neg(0) op_sel_hi:[1,0,0] +; REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; REAL16-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; REAL16-NEXT: v_sin_bf16_e32 v0.l, v0.l +; REAL16-NEXT: global_store_b16 v1, v0, s[0:1] +; REAL16-NEXT: s_endpgm %sin = call bfloat @llvm.sin.bf16(bfloat %src) #0 store bfloat %sin, ptr addrspace(1) %out, align 2 ret void } define amdgpu_kernel void @sin_bf16_constant_4(ptr addrspace(1) %out) #1 { -; GCN-LABEL: sin_bf16_constant_4: -; GCN: ; %bb.0: -; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv -; GCN-NEXT: v_sin_bf16_e32 v0, 0x3f23 -; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: global_store_b16 v1, v0, s[0:1] -; GCN-NEXT: s_endpgm +; FAKE16-LABEL: sin_bf16_constant_4: +; FAKE16: ; %bb.0: +; FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; FAKE16-NEXT: v_sin_bf16_e32 v0, 0x3f23 +; FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; FAKE16-NEXT: s_wait_kmcnt 0x0 +; FAKE16-NEXT: global_store_b16 v1, v0, s[0:1] +; FAKE16-NEXT: s_endpgm +; +; REAL16-LABEL: sin_bf16_constant_4: +; REAL16: ; %bb.0: +; REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; REAL16-NEXT: v_sin_bf16_e32 v0.l, 0x3f23 +; REAL16-NEXT: v_mov_b32_e32 v1, 0 +; REAL16-NEXT: s_wait_kmcnt 0x0 +; REAL16-NEXT: global_store_b16 v1, v0, s[0:1] +; REAL16-NEXT: s_endpgm %sin = call bfloat @llvm.sin.bf16(bfloat 4.0) #0 store bfloat %sin, ptr addrspace(1) %out, align 2 ret void } define amdgpu_kernel void @sin_bf16_constant_100(ptr addrspace(1) %out) #1 { -; GCN-LABEL: sin_bf16_constant_100: -; GCN: ; %bb.0: -; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv -; GCN-NEXT: v_sin_bf16_e32 v0, 0x417f -; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: global_store_b16 v1, v0, s[0:1] -; GCN-NEXT: s_endpgm +; FAKE16-LABEL: sin_bf16_constant_100: +; FAKE16: ; %bb.0: +; FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; FAKE16-NEXT: v_sin_bf16_e32 v0, 0x417f +; FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; FAKE16-NEXT: s_wait_kmcnt 0x0 +; FAKE16-NEXT: global_store_b16 v1, v0, s[0:1] +; FAKE16-NEXT: s_endpgm +; +; REAL16-LABEL: sin_bf16_constant_100: +; REAL16: ; %bb.0: +; REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; REAL16-NEXT: v_sin_bf16_e32 v0.l, 0x417f +; REAL16-NEXT: v_mov_b32_e32 v1, 0 +; REAL16-NEXT: s_wait_kmcnt 0x0 +; REAL16-NEXT: global_store_b16 v1, v0, s[0:1] +; REAL16-NEXT: s_endpgm %sin = call bfloat @llvm.sin.bf16(bfloat 100.0) #0 store bfloat %sin, ptr addrspace(1) %out, align 2 ret void @@ -57,3 +93,5 @@ define amdgpu_kernel void @sin_bf16_constant_100(ptr addrspace(1) %out) #1 { attributes #0 = { nounwind readnone } attributes #1 = { nounwind } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GCN: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll index 04350ee8d17e1..4b9fdf3a768dc 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll @@ -3,7 +3,8 @@ ; RUN: llc -mtriple=amdgcn-- -mcpu=tonga < %s | FileCheck -check-prefix=GFX8 %s ; RUN: llc -mtriple=r600-- -mcpu=cypress < %s | FileCheck -check-prefix=EG %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-FAKE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-REAL16 %s define amdgpu_kernel void @constant_load_i1(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_load_i1: @@ -160,16 +161,27 @@ define amdgpu_kernel void @constant_load_v2i1(ptr addrspace(1) %out, ptr addrspa ; GFX12-NEXT: global_store_b8 v1, v0, s[0:1] ; GFX12-NEXT: s_endpgm ; -; GFX1250-LABEL: constant_load_v2i1: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_load_u8 v1, v0, s[2:3] nv -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b8 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-FAKE16-LABEL: constant_load_v2i1: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: global_load_u8 v1, v0, s[2:3] nv +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: global_store_b8 v0, v1, s[0:1] +; GFX1250-FAKE16-NEXT: s_endpgm +; +; GFX1250-REAL16-LABEL: constant_load_v2i1: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX1250-REAL16-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: global_load_u8 v0, v1, s[2:3] nv +; GFX1250-REAL16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-REAL16-NEXT: global_store_b8 v1, v0, s[0:1] +; GFX1250-REAL16-NEXT: s_endpgm %load = load <2 x i1>, ptr addrspace(4) %in store <2 x i1> %load, ptr addrspace(1) %out ret void @@ -241,16 +253,27 @@ define amdgpu_kernel void @constant_load_v3i1(ptr addrspace(1) %out, ptr addrspa ; GFX12-NEXT: global_store_b8 v1, v0, s[0:1] ; GFX12-NEXT: s_endpgm ; -; GFX1250-LABEL: constant_load_v3i1: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_load_u8 v1, v0, s[2:3] nv -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b8 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-FAKE16-LABEL: constant_load_v3i1: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: global_load_u8 v1, v0, s[2:3] nv +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: global_store_b8 v0, v1, s[0:1] +; GFX1250-FAKE16-NEXT: s_endpgm +; +; GFX1250-REAL16-LABEL: constant_load_v3i1: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX1250-REAL16-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: global_load_u8 v0, v1, s[2:3] nv +; GFX1250-REAL16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-REAL16-NEXT: global_store_b8 v1, v0, s[0:1] +; GFX1250-REAL16-NEXT: s_endpgm %load = load <3 x i1>, ptr addrspace(4) %in store <3 x i1> %load, ptr addrspace(1) %out ret void @@ -323,16 +346,27 @@ define amdgpu_kernel void @constant_load_v4i1(ptr addrspace(1) %out, ptr addrspa ; GFX12-NEXT: global_store_b8 v1, v0, s[0:1] ; GFX12-NEXT: s_endpgm ; -; GFX1250-LABEL: constant_load_v4i1: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_load_u8 v1, v0, s[2:3] nv -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b8 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-FAKE16-LABEL: constant_load_v4i1: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: global_load_u8 v1, v0, s[2:3] nv +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: global_store_b8 v0, v1, s[0:1] +; GFX1250-FAKE16-NEXT: s_endpgm +; +; GFX1250-REAL16-LABEL: constant_load_v4i1: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX1250-REAL16-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: global_load_u8 v0, v1, s[2:3] nv +; GFX1250-REAL16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-REAL16-NEXT: global_store_b8 v1, v0, s[0:1] +; GFX1250-REAL16-NEXT: s_endpgm %load = load <4 x i1>, ptr addrspace(4) %in store <4 x i1> %load, ptr addrspace(1) %out ret void @@ -405,16 +439,27 @@ define amdgpu_kernel void @constant_load_v8i1(ptr addrspace(1) %out, ptr addrspa ; GFX12-NEXT: global_store_b8 v1, v0, s[0:1] ; GFX12-NEXT: s_endpgm ; -; GFX1250-LABEL: constant_load_v8i1: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_load_u8 v1, v0, s[2:3] nv -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b8 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-FAKE16-LABEL: constant_load_v8i1: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: global_load_u8 v1, v0, s[2:3] nv +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: global_store_b8 v0, v1, s[0:1] +; GFX1250-FAKE16-NEXT: s_endpgm +; +; GFX1250-REAL16-LABEL: constant_load_v8i1: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX1250-REAL16-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: global_load_u8 v0, v1, s[2:3] nv +; GFX1250-REAL16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-REAL16-NEXT: global_store_b8 v1, v0, s[0:1] +; GFX1250-REAL16-NEXT: s_endpgm %load = load <8 x i1>, ptr addrspace(4) %in store <8 x i1> %load, ptr addrspace(1) %out ret void @@ -487,16 +532,27 @@ define amdgpu_kernel void @constant_load_v16i1(ptr addrspace(1) %out, ptr addrsp ; GFX12-NEXT: global_store_b16 v1, v0, s[0:1] ; GFX12-NEXT: s_endpgm ; -; GFX1250-LABEL: constant_load_v16i1: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_load_u16 v1, v0, s[2:3] nv -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-FAKE16-LABEL: constant_load_v16i1: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] nv +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-FAKE16-NEXT: s_endpgm +; +; GFX1250-REAL16-LABEL: constant_load_v16i1: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX1250-REAL16-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: global_load_u16 v0, v1, s[2:3] nv +; GFX1250-REAL16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-REAL16-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX1250-REAL16-NEXT: s_endpgm %load = load <16 x i1>, ptr addrspace(4) %in store <16 x i1> %load, ptr addrspace(1) %out ret void @@ -1196,21 +1252,40 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i32(ptr addrspace(1) %out ; GFX12-NEXT: global_store_b96 v3, v[0:2], s[0:1] ; GFX12-NEXT: s_endpgm ; -; GFX1250-LABEL: constant_zextload_v3i1_to_v3i32: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv -; GFX1250-NEXT: v_mov_b32_e32 v3, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_load_u8 v1, v3, s[2:3] nv -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_and_b32_e32 v0, 0xffff, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_dual_lshrrev_b32 v2, 2, v0 :: v_dual_bitop2_b32 v0, 1, v1 bitop3:0x40 -; GFX1250-NEXT: v_bfe_u32 v1, v1, 1, 1 -; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX1250-NEXT: global_store_b96 v3, v[0:2], s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-FAKE16-LABEL: constant_zextload_v3i1_to_v3i32: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v3, 0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: global_load_u8 v1, v3, s[2:3] nv +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-FAKE16-NEXT: v_dual_lshrrev_b32 v2, 2, v0 :: v_dual_bitop2_b32 v0, 1, v1 bitop3:0x40 +; GFX1250-FAKE16-NEXT: v_bfe_u32 v1, v1, 1, 1 +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX1250-FAKE16-NEXT: global_store_b96 v3, v[0:2], s[0:1] +; GFX1250-FAKE16-NEXT: s_endpgm +; +; GFX1250-REAL16-LABEL: constant_zextload_v3i1_to_v3i32: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX1250-REAL16-NEXT: v_mov_b32_e32 v3, 0 +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: global_load_u8 v0, v3, s[2:3] nv +; GFX1250-REAL16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-REAL16-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1250-REAL16-NEXT: s_and_b32 s3, 0xffff, s2 +; GFX1250-REAL16-NEXT: s_and_b32 s4, s2, 1 +; GFX1250-REAL16-NEXT: s_lshr_b32 s3, s3, 2 +; GFX1250-REAL16-NEXT: s_bfe_u32 s2, s2, 0x10001 +; GFX1250-REAL16-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX1250-REAL16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s2 +; GFX1250-REAL16-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-REAL16-NEXT: global_store_b96 v3, v[0:2], s[0:1] +; GFX1250-REAL16-NEXT: s_endpgm %load = load <3 x i1>, ptr addrspace(4) %in %ext = zext <3 x i1> %load to <3 x i32> store <3 x i32> %ext, ptr addrspace(1) %out @@ -1392,22 +1467,41 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i32(ptr addrspace(1) %out ; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX12-NEXT: s_endpgm ; -; GFX1250-LABEL: constant_zextload_v4i1_to_v4i32: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv -; GFX1250-NEXT: v_mov_b32_e32 v4, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_load_u8 v1, v4, s[2:3] nv -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_and_b32_e32 v3, 0xffff, v1 -; GFX1250-NEXT: v_and_b32_e32 v0, 1, v1 -; GFX1250-NEXT: v_bfe_u32 v2, v1, 2, 1 -; GFX1250-NEXT: v_bfe_u32 v1, v1, 1, 1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX1250-NEXT: v_lshrrev_b32_e32 v3, 3, v3 -; GFX1250-NEXT: global_store_b128 v4, v[0:3], s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-FAKE16-LABEL: constant_zextload_v4i1_to_v4i32: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v4, 0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: global_load_u8 v1, v4, s[2:3] nv +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v1 +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v0, 1, v1 +; GFX1250-FAKE16-NEXT: v_bfe_u32 v2, v1, 2, 1 +; GFX1250-FAKE16-NEXT: v_bfe_u32 v1, v1, 1, 1 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 3, v3 +; GFX1250-FAKE16-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX1250-FAKE16-NEXT: s_endpgm +; +; GFX1250-REAL16-LABEL: constant_zextload_v4i1_to_v4i32: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX1250-REAL16-NEXT: v_mov_b32_e32 v4, 0 +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: global_load_u8 v0, v4, s[2:3] nv +; GFX1250-REAL16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-REAL16-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1250-REAL16-NEXT: s_and_b32 s3, 0xffff, s2 +; GFX1250-REAL16-NEXT: s_and_b32 s4, s2, 1 +; GFX1250-REAL16-NEXT: s_bfe_u32 s5, s2, 0x10002 +; GFX1250-REAL16-NEXT: s_bfe_u32 s2, s2, 0x10001 +; GFX1250-REAL16-NEXT: s_lshr_b32 s3, s3, 3 +; GFX1250-REAL16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s2 +; GFX1250-REAL16-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, s3 +; GFX1250-REAL16-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX1250-REAL16-NEXT: s_endpgm %load = load <4 x i1>, ptr addrspace(4) %in %ext = zext <4 x i1> %load to <4 x i32> store <4 x i32> %ext, ptr addrspace(1) %out @@ -1637,31 +1731,58 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i32(ptr addrspace(1) %out ; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] ; GFX12-NEXT: s_endpgm ; -; GFX1250-LABEL: constant_zextload_v8i1_to_v8i32: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv -; GFX1250-NEXT: v_mov_b32_e32 v8, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_load_u8 v0, v8, s[2:3] nv -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1250-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x10003 -; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10001 -; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x10005 -; GFX1250-NEXT: s_and_b32 s6, s2, 1 -; GFX1250-NEXT: s_bfe_u32 s7, s2, 0x10002 -; GFX1250-NEXT: s_bfe_u32 s2, s2, 0x10004 -; GFX1250-NEXT: v_lshrrev_b32_e32 v3, 7, v0 -; GFX1250-NEXT: v_bfe_u32 v2, v0, 6, 1 -; GFX1250-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s5 -; GFX1250-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v5, s4 -; GFX1250-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s3 -; GFX1250-NEXT: s_clause 0x1 -; GFX1250-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16 -; GFX1250-NEXT: global_store_b128 v8, v[4:7], s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-FAKE16-LABEL: constant_zextload_v8i1_to_v8i32: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v8, 0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: global_load_u8 v0, v8, s[2:3] nv +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1250-FAKE16-NEXT: s_bfe_u32 s3, s2, 0x10003 +; GFX1250-FAKE16-NEXT: s_bfe_u32 s4, s2, 0x10001 +; GFX1250-FAKE16-NEXT: s_bfe_u32 s5, s2, 0x10005 +; GFX1250-FAKE16-NEXT: s_and_b32 s6, s2, 1 +; GFX1250-FAKE16-NEXT: s_bfe_u32 s7, s2, 0x10002 +; GFX1250-FAKE16-NEXT: s_bfe_u32 s2, s2, 0x10004 +; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 7, v0 +; GFX1250-FAKE16-NEXT: v_bfe_u32 v2, v0, 6, 1 +; GFX1250-FAKE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s5 +; GFX1250-FAKE16-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v5, s4 +; GFX1250-FAKE16-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s3 +; GFX1250-FAKE16-NEXT: s_clause 0x1 +; GFX1250-FAKE16-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16 +; GFX1250-FAKE16-NEXT: global_store_b128 v8, v[4:7], s[0:1] +; GFX1250-FAKE16-NEXT: s_endpgm +; +; GFX1250-REAL16-LABEL: constant_zextload_v8i1_to_v8i32: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX1250-REAL16-NEXT: v_mov_b32_e32 v8, 0 +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: global_load_u8 v0, v8, s[2:3] nv +; GFX1250-REAL16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-REAL16-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1250-REAL16-NEXT: s_and_b32 s5, 0xffff, s2 +; GFX1250-REAL16-NEXT: s_bfe_u32 s3, s2, 0x10003 +; GFX1250-REAL16-NEXT: s_bfe_u32 s4, s2, 0x10001 +; GFX1250-REAL16-NEXT: s_bfe_u32 s6, s2, 0x10005 +; GFX1250-REAL16-NEXT: s_and_b32 s7, s2, 1 +; GFX1250-REAL16-NEXT: s_bfe_u32 s8, s2, 0x10002 +; GFX1250-REAL16-NEXT: s_bfe_u32 s2, s2, 0x10004 +; GFX1250-REAL16-NEXT: s_lshr_b32 s9, s5, 7 +; GFX1250-REAL16-NEXT: s_bfe_u32 s5, s5, 0x10006 +; GFX1250-REAL16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s6 +; GFX1250-REAL16-NEXT: v_dual_mov_b32 v4, s7 :: v_dual_mov_b32 v2, s5 +; GFX1250-REAL16-NEXT: v_dual_mov_b32 v3, s9 :: v_dual_mov_b32 v5, s4 +; GFX1250-REAL16-NEXT: v_dual_mov_b32 v6, s8 :: v_dual_mov_b32 v7, s3 +; GFX1250-REAL16-NEXT: s_clause 0x1 +; GFX1250-REAL16-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16 +; GFX1250-REAL16-NEXT: global_store_b128 v8, v[4:7], s[0:1] +; GFX1250-REAL16-NEXT: s_endpgm %load = load <8 x i1>, ptr addrspace(4) %in %ext = zext <8 x i1> %load to <8 x i32> store <8 x i32> %ext, ptr addrspace(1) %out @@ -5340,23 +5461,43 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i64(ptr addrspace(1) %out ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] ; GFX12-NEXT: s_endpgm ; -; GFX1250-LABEL: constant_zextload_v2i1_to_v2i64: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv -; GFX1250-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_load_u8 v0, v1, s[2:3] nv -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff, v0 -; GFX1250-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_bitop2_b32 v0, 1, v0 bitop3:0x40 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_lshrrev_b32_e32 v2, 1, v2 -; GFX1250-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-FAKE16-LABEL: constant_zextload_v2i1_to_v2i64: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: global_load_u8 v0, v1, s[2:3] nv +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v0 +; GFX1250-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_bitop2_b32 v0, 1, v0 bitop3:0x40 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 1, v2 +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX1250-FAKE16-NEXT: global_store_b128 v1, v[0:3], s[0:1] +; GFX1250-FAKE16-NEXT: s_endpgm +; +; GFX1250-REAL16-LABEL: constant_zextload_v2i1_to_v2i64: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX1250-REAL16-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: global_load_u8 v0, v1, s[2:3] nv +; GFX1250-REAL16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-REAL16-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1250-REAL16-NEXT: s_and_b32 s3, 0xffff, s2 +; GFX1250-REAL16-NEXT: s_and_b32 s2, s2, 1 +; GFX1250-REAL16-NEXT: s_lshr_b32 s3, s3, 1 +; GFX1250-REAL16-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX1250-REAL16-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX1250-REAL16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-REAL16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v2, s3 +; GFX1250-REAL16-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-REAL16-NEXT: global_store_b128 v1, v[0:3], s[0:1] +; GFX1250-REAL16-NEXT: s_endpgm %load = load <2 x i1>, ptr addrspace(4) %in %ext = zext <2 x i1> %load to <2 x i64> store <2 x i64> %ext, ptr addrspace(1) %out @@ -5561,27 +5702,52 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i64(ptr addrspace(1) %out ; GFX12-NEXT: global_store_b128 v5, v[0:3], s[0:1] ; GFX12-NEXT: s_endpgm ; -; GFX1250-LABEL: constant_zextload_v3i1_to_v3i64: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv -; GFX1250-NEXT: v_mov_b32_e32 v5, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_load_u8 v0, v5, s[2:3] nv -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff, v0 -; GFX1250-NEXT: v_bfe_u32 v2, v0, 1, 1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1250-NEXT: v_dual_lshrrev_b32 v4, 2, v1 :: v_dual_bitop2_b32 v0, 1, v0 bitop3:0x40 -; GFX1250-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v3, v5 -; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1250-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX1250-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX1250-NEXT: s_clause 0x1 -; GFX1250-NEXT: global_store_b64 v5, v[4:5], s[0:1] offset:16 -; GFX1250-NEXT: global_store_b128 v5, v[0:3], s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-FAKE16-LABEL: constant_zextload_v3i1_to_v3i64: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v5, 0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: global_load_u8 v0, v5, s[2:3] nv +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v0 +; GFX1250-FAKE16-NEXT: v_bfe_u32 v2, v0, 1, 1 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1250-FAKE16-NEXT: v_dual_lshrrev_b32 v4, 2, v1 :: v_dual_bitop2_b32 v0, 1, v0 bitop3:0x40 +; GFX1250-FAKE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v3, v5 +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX1250-FAKE16-NEXT: s_clause 0x1 +; GFX1250-FAKE16-NEXT: global_store_b64 v5, v[4:5], s[0:1] offset:16 +; GFX1250-FAKE16-NEXT: global_store_b128 v5, v[0:3], s[0:1] +; GFX1250-FAKE16-NEXT: s_endpgm +; +; GFX1250-REAL16-LABEL: constant_zextload_v3i1_to_v3i64: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX1250-REAL16-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: global_load_u8 v0, v1, s[2:3] nv +; GFX1250-REAL16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-REAL16-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1250-REAL16-NEXT: s_and_b32 s3, 0xffff, s2 +; GFX1250-REAL16-NEXT: s_bfe_u32 s4, s2, 0x10001 +; GFX1250-REAL16-NEXT: s_lshr_b32 s3, s3, 2 +; GFX1250-REAL16-NEXT: s_and_b32 s2, s2, 1 +; GFX1250-REAL16-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX1250-REAL16-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX1250-REAL16-NEXT: v_mov_b32_e32 v0, s3 +; GFX1250-REAL16-NEXT: s_and_b32 s3, 0xffff, s4 +; GFX1250-REAL16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-REAL16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, s3 +; GFX1250-REAL16-NEXT: global_store_b64 v1, v[0:1], s[0:1] offset:16 +; GFX1250-REAL16-NEXT: s_wait_xcnt 0x0 +; GFX1250-REAL16-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-REAL16-NEXT: global_store_b128 v1, v[0:3], s[0:1] +; GFX1250-REAL16-NEXT: s_endpgm %load = load <3 x i1>, ptr addrspace(4) %in %ext = zext <3 x i1> %load to <3 x i64> store <3 x i64> %ext, ptr addrspace(1) %out @@ -5832,32 +5998,58 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(ptr addrspace(1) %out ; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] ; GFX12-NEXT: s_endpgm ; -; GFX1250-LABEL: constant_zextload_v4i1_to_v4i64: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv -; GFX1250-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_load_u8 v0, v1, s[2:3] nv -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1250-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x10002 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1250-NEXT: v_lshrrev_b32_e32 v2, 3, v0 -; GFX1250-NEXT: s_and_b32 s3, 0xffff, s3 -; GFX1250-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v0, s3 -; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x10001 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX1250-NEXT: s_and_b32 s2, s2, 1 -; GFX1250-NEXT: s_and_b32 s3, 0xffff, s3 -; GFX1250-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v2, s3 -; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-FAKE16-LABEL: constant_zextload_v4i1_to_v4i64: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: global_load_u8 v0, v1, s[2:3] nv +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1250-FAKE16-NEXT: s_bfe_u32 s3, s2, 0x10002 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 3, v0 +; GFX1250-FAKE16-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX1250-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v0, s3 +; GFX1250-FAKE16-NEXT: s_bfe_u32 s3, s2, 0x10001 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX1250-FAKE16-NEXT: s_and_b32 s2, s2, 1 +; GFX1250-FAKE16-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX1250-FAKE16-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX1250-FAKE16-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16 +; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v2, s3 +; GFX1250-FAKE16-NEXT: global_store_b128 v1, v[0:3], s[0:1] +; GFX1250-FAKE16-NEXT: s_endpgm +; +; GFX1250-REAL16-LABEL: constant_zextload_v4i1_to_v4i64: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX1250-REAL16-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: global_load_u8 v0, v1, s[2:3] nv +; GFX1250-REAL16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-REAL16-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1250-REAL16-NEXT: s_and_b32 s3, 0xffff, s2 +; GFX1250-REAL16-NEXT: s_bfe_u32 s4, s2, 0x10002 +; GFX1250-REAL16-NEXT: s_lshr_b32 s3, s3, 3 +; GFX1250-REAL16-NEXT: s_and_b32 s4, 0xffff, s4 +; GFX1250-REAL16-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX1250-REAL16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, v1 +; GFX1250-REAL16-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-REAL16-NEXT: s_bfe_u32 s3, s2, 0x10001 +; GFX1250-REAL16-NEXT: s_and_b32 s2, s2, 1 +; GFX1250-REAL16-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX1250-REAL16-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX1250-REAL16-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16 +; GFX1250-REAL16-NEXT: s_wait_xcnt 0x0 +; GFX1250-REAL16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v2, s3 +; GFX1250-REAL16-NEXT: global_store_b128 v1, v[0:3], s[0:1] +; GFX1250-REAL16-NEXT: s_endpgm %load = load <4 x i1>, ptr addrspace(4) %in %ext = zext <4 x i1> %load to <4 x i64> store <4 x i64> %ext, ptr addrspace(1) %out @@ -6732,49 +6924,103 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o ; GFX12-NEXT: global_store_b128 v3, v[2:5], s[0:1] ; GFX12-NEXT: s_endpgm ; -; GFX1250-LABEL: constant_zextload_v16i1_to_v16i64: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv -; GFX1250-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_load_u16 v12, v1, s[2:3] nv -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_and_b32_e32 v22, 0xffff, v12 -; GFX1250-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_bitop2_b32 v28, 1, v12 bitop3:0x40 -; GFX1250-NEXT: v_mov_b32_e32 v5, v1 -; GFX1250-NEXT: v_bfe_u32 v0, v12, 10, 1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX1250-NEXT: v_bfe_u32 v2, v22, 11, 1 -; GFX1250-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v9, v1 -; GFX1250-NEXT: v_bfe_u32 v6, v12, 9, 1 -; GFX1250-NEXT: v_bfe_u32 v4, v22, 8, 1 -; GFX1250-NEXT: v_dual_mov_b32 v11, v1 :: v_dual_mov_b32 v13, v1 -; GFX1250-NEXT: v_dual_mov_b32 v31, v1 :: v_dual_lshrrev_b32 v10, 15, v22 -; GFX1250-NEXT: v_bfe_u32 v8, v22, 14, 1 -; GFX1250-NEXT: v_dual_mov_b32 v15, v1 :: v_dual_mov_b32 v17, v1 -; GFX1250-NEXT: v_bfe_u32 v14, v12, 13, 1 -; GFX1250-NEXT: v_bfe_u32 v18, v12, 7, 1 -; GFX1250-NEXT: v_bfe_u32 v26, v12, 3, 1 -; GFX1250-NEXT: v_bfe_u32 v30, v12, 1, 1 -; GFX1250-NEXT: v_bfe_u32 v24, v12, 2, 1 -; GFX1250-NEXT: v_bfe_u32 v20, v12, 4, 1 -; GFX1250-NEXT: v_bfe_u32 v16, v12, 6, 1 -; GFX1250-NEXT: v_bfe_u32 v12, v12, 12, 1 -; GFX1250-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v21, v1 -; GFX1250-NEXT: v_dual_mov_b32 v23, v1 :: v_dual_mov_b32 v25, v1 -; GFX1250-NEXT: v_dual_mov_b32 v27, v1 :: v_dual_mov_b32 v29, v1 -; GFX1250-NEXT: v_bfe_u32 v22, v22, 5, 1 -; GFX1250-NEXT: s_clause 0x7 -; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:80 -; GFX1250-NEXT: global_store_b128 v1, v[4:7], s[0:1] offset:64 -; GFX1250-NEXT: global_store_b128 v1, v[8:11], s[0:1] offset:112 -; GFX1250-NEXT: global_store_b128 v1, v[12:15], s[0:1] offset:96 -; GFX1250-NEXT: global_store_b128 v1, v[16:19], s[0:1] offset:48 -; GFX1250-NEXT: global_store_b128 v1, v[20:23], s[0:1] offset:32 -; GFX1250-NEXT: global_store_b128 v1, v[24:27], s[0:1] offset:16 -; GFX1250-NEXT: global_store_b128 v1, v[28:31], s[0:1] -; GFX1250-NEXT: s_endpgm +; GFX1250-FAKE16-LABEL: constant_zextload_v16i1_to_v16i64: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: global_load_u16 v12, v1, s[2:3] nv +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff, v12 +; GFX1250-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_bitop2_b32 v28, 1, v12 bitop3:0x40 +; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v5, v1 +; GFX1250-FAKE16-NEXT: v_bfe_u32 v0, v12, 10, 1 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1250-FAKE16-NEXT: v_bfe_u32 v2, v22, 11, 1 +; GFX1250-FAKE16-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v9, v1 +; GFX1250-FAKE16-NEXT: v_bfe_u32 v6, v12, 9, 1 +; GFX1250-FAKE16-NEXT: v_bfe_u32 v4, v22, 8, 1 +; GFX1250-FAKE16-NEXT: v_dual_mov_b32 v11, v1 :: v_dual_mov_b32 v13, v1 +; GFX1250-FAKE16-NEXT: v_dual_mov_b32 v31, v1 :: v_dual_lshrrev_b32 v10, 15, v22 +; GFX1250-FAKE16-NEXT: v_bfe_u32 v8, v22, 14, 1 +; GFX1250-FAKE16-NEXT: v_dual_mov_b32 v15, v1 :: v_dual_mov_b32 v17, v1 +; GFX1250-FAKE16-NEXT: v_bfe_u32 v14, v12, 13, 1 +; GFX1250-FAKE16-NEXT: v_bfe_u32 v18, v12, 7, 1 +; GFX1250-FAKE16-NEXT: v_bfe_u32 v26, v12, 3, 1 +; GFX1250-FAKE16-NEXT: v_bfe_u32 v30, v12, 1, 1 +; GFX1250-FAKE16-NEXT: v_bfe_u32 v24, v12, 2, 1 +; GFX1250-FAKE16-NEXT: v_bfe_u32 v20, v12, 4, 1 +; GFX1250-FAKE16-NEXT: v_bfe_u32 v16, v12, 6, 1 +; GFX1250-FAKE16-NEXT: v_bfe_u32 v12, v12, 12, 1 +; GFX1250-FAKE16-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v21, v1 +; GFX1250-FAKE16-NEXT: v_dual_mov_b32 v23, v1 :: v_dual_mov_b32 v25, v1 +; GFX1250-FAKE16-NEXT: v_dual_mov_b32 v27, v1 :: v_dual_mov_b32 v29, v1 +; GFX1250-FAKE16-NEXT: v_bfe_u32 v22, v22, 5, 1 +; GFX1250-FAKE16-NEXT: s_clause 0x7 +; GFX1250-FAKE16-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:80 +; GFX1250-FAKE16-NEXT: global_store_b128 v1, v[4:7], s[0:1] offset:64 +; GFX1250-FAKE16-NEXT: global_store_b128 v1, v[8:11], s[0:1] offset:112 +; GFX1250-FAKE16-NEXT: global_store_b128 v1, v[12:15], s[0:1] offset:96 +; GFX1250-FAKE16-NEXT: global_store_b128 v1, v[16:19], s[0:1] offset:48 +; GFX1250-FAKE16-NEXT: global_store_b128 v1, v[20:23], s[0:1] offset:32 +; GFX1250-FAKE16-NEXT: global_store_b128 v1, v[24:27], s[0:1] offset:16 +; GFX1250-FAKE16-NEXT: global_store_b128 v1, v[28:31], s[0:1] +; GFX1250-FAKE16-NEXT: s_endpgm +; +; GFX1250-REAL16-LABEL: constant_zextload_v16i1_to_v16i64: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX1250-REAL16-NEXT: v_mov_b32_e32 v3, 0 +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: global_load_u16 v0, v3, s[2:3] nv +; GFX1250-REAL16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-REAL16-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1250-REAL16-NEXT: s_and_b32 s3, 0xffff, s2 +; GFX1250-REAL16-NEXT: s_bfe_u32 s4, s2, 0x1000a +; GFX1250-REAL16-NEXT: s_bfe_u32 s5, s3, 0x1000b +; GFX1250-REAL16-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v5, v3 +; GFX1250-REAL16-NEXT: v_mov_b32_e32 v4, s5 +; GFX1250-REAL16-NEXT: s_bfe_u32 s4, s2, 0x10009 +; GFX1250-REAL16-NEXT: s_bfe_u32 s5, s3, 0x10008 +; GFX1250-REAL16-NEXT: v_mov_b32_e32 v1, v3 +; GFX1250-REAL16-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:80 +; GFX1250-REAL16-NEXT: s_wait_xcnt 0x0 +; GFX1250-REAL16-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v4, s4 +; GFX1250-REAL16-NEXT: s_lshr_b32 s4, s3, 15 +; GFX1250-REAL16-NEXT: s_bfe_u32 s5, s3, 0x1000e +; GFX1250-REAL16-NEXT: s_bfe_u32 s3, s3, 0x10005 +; GFX1250-REAL16-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:64 +; GFX1250-REAL16-NEXT: s_wait_xcnt 0x0 +; GFX1250-REAL16-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v4, s4 +; GFX1250-REAL16-NEXT: s_bfe_u32 s4, s2, 0x1000d +; GFX1250-REAL16-NEXT: s_bfe_u32 s5, s2, 0x1000c +; GFX1250-REAL16-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:112 +; GFX1250-REAL16-NEXT: s_wait_xcnt 0x0 +; GFX1250-REAL16-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v4, s4 +; GFX1250-REAL16-NEXT: s_bfe_u32 s4, s2, 0x10007 +; GFX1250-REAL16-NEXT: s_bfe_u32 s5, s2, 0x10006 +; GFX1250-REAL16-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:96 +; GFX1250-REAL16-NEXT: s_wait_xcnt 0x0 +; GFX1250-REAL16-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v4, s4 +; GFX1250-REAL16-NEXT: s_bfe_u32 s4, s2, 0x10004 +; GFX1250-REAL16-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:48 +; GFX1250-REAL16-NEXT: s_wait_xcnt 0x0 +; GFX1250-REAL16-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v4, s3 +; GFX1250-REAL16-NEXT: s_bfe_u32 s3, s2, 0x10003 +; GFX1250-REAL16-NEXT: s_bfe_u32 s4, s2, 0x10002 +; GFX1250-REAL16-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-REAL16-NEXT: s_bfe_u32 s2, s2, 0x10001 +; GFX1250-REAL16-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:32 +; GFX1250-REAL16-NEXT: s_wait_xcnt 0x0 +; GFX1250-REAL16-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v4, s3 +; GFX1250-REAL16-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1250-REAL16-NEXT: global_store_b128 v3, v[2:5], s[0:1] offset:16 +; GFX1250-REAL16-NEXT: s_wait_xcnt 0x0 +; GFX1250-REAL16-NEXT: v_mov_b32_e32 v2, s2 +; GFX1250-REAL16-NEXT: global_store_b128 v3, v[0:3], s[0:1] +; GFX1250-REAL16-NEXT: s_endpgm %load = load <16 x i1>, ptr addrspace(4) %in %ext = zext <16 x i1> %load to <16 x i64> store <16 x i64> %ext, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-bf16.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-bf16.ll index 777dde940d194..6826ab50e7466 100644 --- a/llvm/test/CodeGen/AMDGPU/mad-mix-bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix-bf16.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 -; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-FAKE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-REAL16 %s define float @v_mad_mix_f32_bf16lo_bf16lo_bf16lo(bfloat %src0, bfloat %src1, bfloat %src2) #0 { ; GFX1250-LABEL: v_mad_mix_f32_bf16lo_bf16lo_bf16lo: @@ -464,15 +465,25 @@ define float @v_mad_mix_f32_negprecvtbf16lo_bf16lo_bf16lo(i32 %src0.arg, bfloat } define float @v_mad_mix_f32_precvtnegbf16hi_abs_bf16lo_bf16lo(i32 %src0.arg, bfloat %src1, bfloat %src2) #0 { -; GFX1250-LABEL: v_mad_mix_f32_precvtnegbf16hi_abs_bf16lo_bf16lo: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_xor_b32_e32 v0, 0x8000, v0 -; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, |v0|, v1, v2 op_sel_hi:[1,1,1] -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-FAKE16-LABEL: v_mad_mix_f32_precvtnegbf16hi_abs_bf16lo_bf16lo: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX1250-FAKE16-NEXT: v_fma_mix_f32_bf16 v0, |v0|, v1, v2 op_sel_hi:[1,1,1] +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-REAL16-LABEL: v_mad_mix_f32_precvtnegbf16hi_abs_bf16lo_bf16lo: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX1250-REAL16-NEXT: v_xor_b16 v2.l, 0x8000, v0.h +; GFX1250-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-REAL16-NEXT: v_fma_mix_f32_bf16 v0, |v2|, v1, v0 op_sel_hi:[1,1,1] +; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31] %src0.arg.bc = bitcast i32 %src0.arg to <2 x bfloat> %src0 = extractelement <2 x bfloat> %src0.arg.bc, i32 1 %src0.neg = fneg bfloat %src0 @@ -554,15 +565,29 @@ define float @v_mad_mix_f32_preextractfabsfneg_bf16hi_bf16lo_bf16lo(i32 %src0.ar } define float @v_mad_mix_f32_bf16lo_bf16lo_bf16lo_all_cast_from_half(half %src0, half %src1, half %src2) #0 { -; GFX1250-LABEL: v_mad_mix_f32_bf16lo_bf16lo_bf16lo_all_cast_from_half: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_dual_lshlrev_b32 v3, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1 -; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_fmac_f32_e32 v0, v3, v1 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-FAKE16-LABEL: v_mad_mix_f32_bf16lo_bf16lo_bf16lo_all_cast_from_half: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_dual_lshlrev_b32 v3, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_fmac_f32_e32 v0, v3, v1 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-REAL16-LABEL: v_mad_mix_f32_bf16lo_bf16lo_bf16lo_all_cast_from_half: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX1250-REAL16-NEXT: v_mov_b16_e32 v3.h, v0.l +; GFX1250-REAL16-NEXT: v_mov_b16_e32 v4.h, v1.l +; GFX1250-REAL16-NEXT: v_mov_b16_e32 v0.h, v2.l +; GFX1250-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-REAL16-NEXT: v_mov_b16_e32 v4.l, v3.l +; GFX1250-REAL16-NEXT: v_mov_b16_e32 v0.l, v3.l +; GFX1250-REAL16-NEXT: v_fmac_f32_e32 v0, v3, v4 +; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31] %src0.bf16 = bitcast half %src0 to bfloat %src1.bf16 = bitcast half %src1 to bfloat %src2.bf16 = bitcast half %src2 to bfloat @@ -574,14 +599,24 @@ define float @v_mad_mix_f32_bf16lo_bf16lo_bf16lo_all_cast_from_half(half %src0, } define float @v_mad_mix_f32_bf16lo_cast_from_half_bf16lo_bf16lo(half %src0, bfloat %src1, bfloat %src2) #0 { -; GFX1250-LABEL: v_mad_mix_f32_bf16lo_cast_from_half_bf16lo_bf16lo: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, v2 op_sel_hi:[0,1,1] -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-FAKE16-LABEL: v_mad_mix_f32_bf16lo_cast_from_half_bf16lo_bf16lo: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, v2 op_sel_hi:[0,1,1] +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-REAL16-LABEL: v_mad_mix_f32_bf16lo_cast_from_half_bf16lo_bf16lo: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX1250-REAL16-NEXT: v_mov_b16_e32 v3.h, v0.l +; GFX1250-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-REAL16-NEXT: v_fma_mix_f32_bf16 v0, v3, v1, v2 op_sel_hi:[0,1,1] +; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31] %src0.bf16 = bitcast half %src0 to bfloat %src0.ext = fpext bfloat %src0.bf16 to float %src1.ext = fpext bfloat %src1 to float @@ -883,15 +918,25 @@ define float @v_mad_mix_f32_negabsprecvtbf16lo_add_bf16lo(i32 %src0.arg, bfloat } define float @v_mad_mix_f32_precvtnegbf16hi_abs_add_bf16lo(i32 %src0.arg, bfloat %src1) { -; GFX1250-LABEL: v_mad_mix_f32_precvtnegbf16hi_abs_add_bf16lo: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_xor_b32_e32 v0, 0x8000, v0 -; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, |v0|, 1.0, v1 op_sel:[0,1,0] op_sel_hi:[1,1,1] -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-FAKE16-LABEL: v_mad_mix_f32_precvtnegbf16hi_abs_add_bf16lo: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX1250-FAKE16-NEXT: v_fma_mix_f32_bf16 v0, |v0|, 1.0, v1 op_sel:[0,1,0] op_sel_hi:[1,1,1] +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-REAL16-LABEL: v_mad_mix_f32_precvtnegbf16hi_abs_add_bf16lo: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX1250-REAL16-NEXT: v_xor_b16 v1.l, 0x8000, v0.h +; GFX1250-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-REAL16-NEXT: v_fma_mix_f32_bf16 v0, |v1|, 1.0, v0 op_sel:[0,1,0] op_sel_hi:[1,1,1] +; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31] %src0.arg.bc = bitcast i32 %src0.arg to <2 x bfloat> %src0 = extractelement <2 x bfloat> %src0.arg.bc, i32 1 %src0.neg = fneg bfloat %src0 @@ -1235,15 +1280,25 @@ define float @v_mad_mix_f32_negabsprecvtbf16lo_mul_bf16lo(i32 %src0.arg, bfloat } define float @v_mad_mix_f32_precvtnegbf16hi_abs_mul_bf16lo(i32 %src0.arg, bfloat %src1) { -; GFX1250-LABEL: v_mad_mix_f32_precvtnegbf16hi_abs_mul_bf16lo: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_xor_b32_e32 v0, 0x8000, v0 -; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, |v0|, v1, neg(0) op_sel_hi:[1,1,0] -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-FAKE16-LABEL: v_mad_mix_f32_precvtnegbf16hi_abs_mul_bf16lo: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX1250-FAKE16-NEXT: v_fma_mix_f32_bf16 v0, |v0|, v1, neg(0) op_sel_hi:[1,1,0] +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-REAL16-LABEL: v_mad_mix_f32_precvtnegbf16hi_abs_mul_bf16lo: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX1250-REAL16-NEXT: v_xor_b16 v1.l, 0x8000, v0.h +; GFX1250-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-REAL16-NEXT: v_fma_mix_f32_bf16 v0, |v1|, v0, neg(0) op_sel_hi:[1,1,0] +; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31] %src0.arg.bc = bitcast i32 %src0.arg to <2 x bfloat> %src0 = extractelement <2 x bfloat> %src0.arg.bc, i32 1 %src0.neg = fneg bfloat %src0 diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-hi-bf16.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-hi-bf16.ll index 4c3f61522f5ae..3b684268dcc50 100644 --- a/llvm/test/CodeGen/AMDGPU/mad-mix-hi-bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix-hi-bf16.ll @@ -1,13 +1,24 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 -; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-FAKE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-REAL16 %s define <2 x bfloat> @v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_undeflo(bfloat %src0, bfloat %src1, bfloat %src2) #0 { -; GFX1250-LABEL: v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_undeflo: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_fma_mixhi_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1] -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-FAKE16-LABEL: v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_undeflo: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_fma_mixhi_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1] +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-REAL16-LABEL: v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_undeflo: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1] +; GFX1250-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-REAL16-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-REAL16-NEXT: v_mov_b16_e32 v0.h, v0.l +; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31] %src0.ext = fpext bfloat %src0 to float %src1.ext = fpext bfloat %src1 to float %src2.ext = fpext bfloat %src2 to float @@ -18,15 +29,26 @@ define <2 x bfloat> @v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_undeflo(bfloat %src0, } define <2 x bfloat> @v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_constlo(bfloat %src0, bfloat %src1, bfloat %src2) #0 { -; GFX1250-LABEL: v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_constlo: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v3, 0x3f80 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_fma_mixhi_bf16 v3, v0, v1, v2 op_sel_hi:[1,1,1] -; GFX1250-NEXT: v_mov_b32_e32 v0, v3 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-FAKE16-LABEL: v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_constlo: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v3, 0x3f80 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_fma_mixhi_bf16 v3, v0, v1, v2 op_sel_hi:[1,1,1] +; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-REAL16-LABEL: v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_constlo: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1] +; GFX1250-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-REAL16-NEXT: v_cvt_pk_bf16_f32 v1, v0, s0 +; GFX1250-REAL16-NEXT: v_mov_b16_e32 v0.l, 0x3f80 +; GFX1250-REAL16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31] %src0.ext = fpext bfloat %src0 to float %src1.ext = fpext bfloat %src1 to float %src2.ext = fpext bfloat %src2 to float @@ -37,14 +59,25 @@ define <2 x bfloat> @v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_constlo(bfloat %src0, } define <2 x bfloat> @v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_reglo(bfloat %src0, bfloat %src1, bfloat %src2, bfloat %lo) #0 { -; GFX1250-LABEL: v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_reglo: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_fma_mixhi_bf16 v3, v0, v1, v2 op_sel_hi:[1,1,1] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_mov_b32_e32 v0, v3 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-FAKE16-LABEL: v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_reglo: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_fma_mixhi_bf16 v3, v0, v1, v2 op_sel_hi:[1,1,1] +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-REAL16-LABEL: v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_reglo: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1] +; GFX1250-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-REAL16-NEXT: v_cvt_pk_bf16_f32 v1, v0, s0 +; GFX1250-REAL16-NEXT: v_mov_b16_e32 v0.l, v3.l +; GFX1250-REAL16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31] %src0.ext = fpext bfloat %src0 to float %src1.ext = fpext bfloat %src1 to float %src2.ext = fpext bfloat %src2 to float @@ -56,14 +89,25 @@ define <2 x bfloat> @v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_reglo(bfloat %src0, b } define i32 @v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_intpack(bfloat %src0, bfloat %src1, bfloat %src2) #0 { -; GFX1250-LABEL: v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_intpack: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-FAKE16-LABEL: v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_intpack: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_fma_mixlo_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1] +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-REAL16-LABEL: v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_intpack: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1] +; GFX1250-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-REAL16-NEXT: v_cvt_pk_bf16_f32 v1, v0, s0 +; GFX1250-REAL16-NEXT: v_mov_b16_e32 v0.l, 0 +; GFX1250-REAL16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31] %src0.ext = fpext bfloat %src0 to float %src1.ext = fpext bfloat %src1 to float %src2.ext = fpext bfloat %src2 to float @@ -76,14 +120,25 @@ define i32 @v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_intpack(bfloat %src0, bfloat % } define i32 @v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_intpack_sext(bfloat %src0, bfloat %src1, bfloat %src2) #0 { -; GFX1250-LABEL: v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_intpack_sext: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-FAKE16-LABEL: v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_intpack_sext: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_fma_mixlo_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1] +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-REAL16-LABEL: v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_intpack_sext: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1] +; GFX1250-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-REAL16-NEXT: v_cvt_pk_bf16_f32 v1, v0, s0 +; GFX1250-REAL16-NEXT: v_mov_b16_e32 v0.l, 0 +; GFX1250-REAL16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31] %src0.ext = fpext bfloat %src0 to float %src1.ext = fpext bfloat %src1 to float %src2.ext = fpext bfloat %src2 to float @@ -96,15 +151,25 @@ define i32 @v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_intpack_sext(bfloat %src0, bfl } define <2 x bfloat> @v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_undeflo_clamp_precvt(bfloat %src0, bfloat %src1, bfloat %src2) #0 { -; GFX1250-LABEL: v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_undeflo_clamp_precvt: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 -; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-FAKE16-LABEL: v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_undeflo_clamp_precvt: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-REAL16-LABEL: v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_undeflo_clamp_precvt: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp +; GFX1250-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-REAL16-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-REAL16-NEXT: v_mov_b16_e32 v0.h, v0.l +; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31] %src0.ext = fpext bfloat %src0 to float %src1.ext = fpext bfloat %src1 to float %src2.ext = fpext bfloat %src2 to float @@ -117,12 +182,24 @@ define <2 x bfloat> @v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_undeflo_clamp_precvt( } define <2 x bfloat> @v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_undeflo_clamp_postcvt(bfloat %src0, bfloat %src1, bfloat %src2) #0 { -; GFX1250-LABEL: v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_undeflo_clamp_postcvt: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_fma_mixhi_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-FAKE16-LABEL: v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_undeflo_clamp_postcvt: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_fma_mixhi_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-REAL16-LABEL: v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_undeflo_clamp_postcvt: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1] +; GFX1250-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-REAL16-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-REAL16-NEXT: v_pk_max_num_bf16 v0, v0, v0 op_sel_hi:[0,0] clamp +; GFX1250-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-REAL16-NEXT: v_mov_b16_e32 v0.h, v0.l +; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31] %src0.ext = fpext bfloat %src0 to float %src1.ext = fpext bfloat %src1 to float %src2.ext = fpext bfloat %src2 to float @@ -135,15 +212,30 @@ define <2 x bfloat> @v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_undeflo_clamp_postcvt } define <2 x bfloat> @v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_undeflo_clamp_postcvt_multi_use(bfloat %src0, bfloat %src1, bfloat %src2) #0 { -; GFX1250-LABEL: v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_undeflo_clamp_postcvt_multi_use: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_fma_mixlo_bf16 v3, v0, v1, v2 op_sel_hi:[1,1,1] -; GFX1250-NEXT: v_fma_mixhi_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp -; GFX1250-NEXT: global_store_b16 v[0:1], v3, off scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_storecnt 0x0 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-FAKE16-LABEL: v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_undeflo_clamp_postcvt_multi_use: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_fma_mixlo_bf16 v3, v0, v1, v2 op_sel_hi:[1,1,1] +; GFX1250-FAKE16-NEXT: v_fma_mixhi_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp +; GFX1250-FAKE16-NEXT: global_store_b16 v[0:1], v3, off scope:SCOPE_SYS +; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-REAL16-LABEL: v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_undeflo_clamp_postcvt_multi_use: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1] +; GFX1250-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-REAL16-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-REAL16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX1250-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-REAL16-NEXT: v_pk_max_num_bf16 v1, v1, v1 op_sel_hi:[0,0] clamp +; GFX1250-REAL16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX1250-REAL16-NEXT: global_store_b16 v[0:1], v0, off scope:SCOPE_SYS +; GFX1250-REAL16-NEXT: s_wait_storecnt 0x0 +; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31] %src0.ext = fpext bfloat %src0 to float %src1.ext = fpext bfloat %src1 to float %src2.ext = fpext bfloat %src2 to float @@ -165,3 +257,5 @@ declare <2 x float> @llvm.fmuladd.v2f32(<2 x float>, <2 x float>, <2 x float>) # attributes #0 = { nounwind denormal_fpenv(float: preservesign) } attributes #1 = { nounwind readnone speculatable } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX1250: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo-bf16.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo-bf16.ll index a2f6b5f7cd073..8eb7ccf6d5ab1 100644 --- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo-bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo-bf16.ll @@ -1,37 +1,65 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 -; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-FAKE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-REAL16 %s define bfloat @mixlo_simple(float %src0, float %src1, float %src2) #0 { -; GFX1250-LABEL: mixlo_simple: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v0, v1, v2 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-FAKE16-LABEL: mixlo_simple: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_fma_mixlo_bf16 v0, v0, v1, v2 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-REAL16-LABEL: mixlo_simple: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: v_fmac_f32_e32 v2, v0, v1 +; GFX1250-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-REAL16-NEXT: v_cvt_pk_bf16_f32 v0, v2, s0 +; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31] %result = call float @llvm.fmuladd.f32(float %src0, float %src1, float %src2) %cvt.result = fptrunc float %result to bfloat ret bfloat %cvt.result } define bfloat @mixlo_simpl_no_flush(float %src0, float %src1, float %src2) { -; GFX1250-LABEL: mixlo_simpl_no_flush: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v0, v1, v2 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-FAKE16-LABEL: mixlo_simpl_no_flush: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_fma_mixlo_bf16 v0, v0, v1, v2 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-REAL16-LABEL: mixlo_simpl_no_flush: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: v_fmac_f32_e32 v2, v0, v1 +; GFX1250-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-REAL16-NEXT: v_cvt_pk_bf16_f32 v0, v2, s0 +; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31] %result = call float @llvm.fmuladd.f32(float %src0, float %src1, float %src2) %cvt.result = fptrunc float %result to bfloat ret bfloat %cvt.result } define bfloat @v_mad_mixlo_bf16_bf16lo_bf16lo_bf16lo(bfloat %src0, bfloat %src1, bfloat %src2) #0 { -; GFX1250-LABEL: v_mad_mixlo_bf16_bf16lo_bf16lo_bf16lo: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1] -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-FAKE16-LABEL: v_mad_mixlo_bf16_bf16lo_bf16lo_bf16lo: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_fma_mixlo_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1] +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-REAL16-LABEL: v_mad_mixlo_bf16_bf16lo_bf16lo_bf16lo: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1] +; GFX1250-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-REAL16-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31] %src0.ext = fpext bfloat %src0 to float %src1.ext = fpext bfloat %src1 to float %src2.ext = fpext bfloat %src2 to float @@ -41,12 +69,21 @@ define bfloat @v_mad_mixlo_bf16_bf16lo_bf16lo_bf16lo(bfloat %src0, bfloat %src1, } define bfloat @v_mad_mixlo_bf16_bf16lo_bf16lo_bf16lo_no_flush(bfloat %src0, bfloat %src1, bfloat %src2) { -; GFX1250-LABEL: v_mad_mixlo_bf16_bf16lo_bf16lo_bf16lo_no_flush: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1] -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-FAKE16-LABEL: v_mad_mixlo_bf16_bf16lo_bf16lo_bf16lo_no_flush: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_fma_mixlo_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1] +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-REAL16-LABEL: v_mad_mixlo_bf16_bf16lo_bf16lo_bf16lo_no_flush: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1] +; GFX1250-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-REAL16-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31] %src0.ext = fpext bfloat %src0 to float %src1.ext = fpext bfloat %src1 to float %src2.ext = fpext bfloat %src2 to float @@ -56,12 +93,21 @@ define bfloat @v_mad_mixlo_bf16_bf16lo_bf16lo_bf16lo_no_flush(bfloat %src0, bflo } define bfloat @v_mad_mixlo_bf16_bf16lo_bf16lo_f32(bfloat %src0, bfloat %src1, float %src2) #0 { -; GFX1250-LABEL: v_mad_mixlo_bf16_bf16lo_bf16lo_f32: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,0] -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-FAKE16-LABEL: v_mad_mixlo_bf16_bf16lo_bf16lo_f32: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_fma_mixlo_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,0] +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-REAL16-LABEL: v_mad_mixlo_bf16_bf16lo_bf16lo_f32: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,0] +; GFX1250-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-REAL16-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31] %src0.ext = fpext bfloat %src0 to float %src1.ext = fpext bfloat %src1 to float %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2) @@ -70,12 +116,22 @@ define bfloat @v_mad_mixlo_bf16_bf16lo_bf16lo_f32(bfloat %src0, bfloat %src1, fl } define bfloat @v_mad_mixlo_bf16_bf16lo_bf16lo_f32_clamp_post_cvt(bfloat %src0, bfloat %src1, float %src2) #0 { -; GFX1250-LABEL: v_mad_mixlo_bf16_bf16lo_bf16lo_f32_clamp_post_cvt: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-FAKE16-LABEL: v_mad_mixlo_bf16_bf16lo_bf16lo_f32_clamp_post_cvt: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_fma_mixlo_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-REAL16-LABEL: v_mad_mixlo_bf16_bf16lo_bf16lo_f32_clamp_post_cvt: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,0] +; GFX1250-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-REAL16-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-REAL16-NEXT: v_pk_max_num_bf16 v0, v0, v0 op_sel_hi:[0,0] clamp +; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31] %src0.ext = fpext bfloat %src0 to float %src1.ext = fpext bfloat %src1 to float %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2) @@ -126,16 +182,28 @@ define <2 x bfloat> @v_mad_mix_v2f32(<2 x bfloat> %src0, <2 x bfloat> %src1, <2 } define <3 x bfloat> @v_mad_mix_v3f32(<3 x bfloat> %src0, <3 x bfloat> %src1, <3 x bfloat> %src2) #0 { -; GFX1250-LABEL: v_mad_mix_v3f32: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_fma_mixlo_bf16 v6, v0, v2, v4 op_sel_hi:[1,1,1] -; GFX1250-NEXT: v_fma_mixlo_bf16 v1, v1, v3, v5 op_sel_hi:[1,1,1] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_fma_mixhi_bf16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] -; GFX1250-NEXT: v_mov_b32_e32 v0, v6 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-FAKE16-LABEL: v_mad_mix_v3f32: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_fma_mixlo_bf16 v6, v0, v2, v4 op_sel_hi:[1,1,1] +; GFX1250-FAKE16-NEXT: v_fma_mixlo_bf16 v1, v1, v3, v5 op_sel_hi:[1,1,1] +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_fma_mixhi_bf16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v0, v6 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-REAL16-LABEL: v_mad_mix_v3f32: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: v_fma_mix_f32_bf16 v1, v1, v3, v5 op_sel_hi:[1,1,1] +; GFX1250-REAL16-NEXT: v_fma_mix_f32_bf16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GFX1250-REAL16-NEXT: v_fma_mix_f32_bf16 v0, v0, v2, v4 op_sel_hi:[1,1,1] +; GFX1250-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-REAL16-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; GFX1250-REAL16-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3 +; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31] %src0.ext = fpext <3 x bfloat> %src0 to <3 x float> %src1.ext = fpext <3 x bfloat> %src1 to <3 x float> %src2.ext = fpext <3 x bfloat> %src2 to <3 x float> @@ -200,18 +268,32 @@ define <2 x bfloat> @v_mad_mix_v2f32_clamp_postcvt(<2 x bfloat> %src0, <2 x bflo } define <3 x bfloat> @v_mad_mix_v3f32_clamp_postcvt(<3 x bfloat> %src0, <3 x bfloat> %src1, <3 x bfloat> %src2) #0 { -; GFX1250-LABEL: v_mad_mix_v3f32_clamp_postcvt: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_fma_mixlo_bf16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp -; GFX1250-NEXT: v_fma_mixlo_bf16 v1, v1, v3, v5 op_sel_hi:[1,1,1] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_fma_mixhi_bf16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX1250-NEXT: v_pk_max_num_bf16 v1, v1, v1 clamp -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1250-NEXT: v_mov_b32_e32 v0, v6 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-FAKE16-LABEL: v_mad_mix_v3f32_clamp_postcvt: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_fma_mixlo_bf16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp +; GFX1250-FAKE16-NEXT: v_fma_mixlo_bf16 v1, v1, v3, v5 op_sel_hi:[1,1,1] +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-FAKE16-NEXT: v_fma_mixhi_bf16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GFX1250-FAKE16-NEXT: v_pk_max_num_bf16 v1, v1, v1 clamp +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v0, v6 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-REAL16-LABEL: v_mad_mix_v3f32_clamp_postcvt: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: v_fma_mix_f32_bf16 v1, v1, v3, v5 op_sel_hi:[1,1,1] +; GFX1250-REAL16-NEXT: v_fma_mix_f32_bf16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GFX1250-REAL16-NEXT: v_fma_mix_f32_bf16 v0, v0, v2, v4 op_sel_hi:[1,1,1] +; GFX1250-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-REAL16-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; GFX1250-REAL16-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3 clamp +; GFX1250-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-REAL16-NEXT: v_pk_max_num_bf16 v1, v1, v1 clamp +; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31] %src0.ext = fpext <3 x bfloat> %src0 to <3 x float> %src1.ext = fpext <3 x bfloat> %src1 to <3 x float> %src2.ext = fpext <3 x bfloat> %src2 to <3 x float> @@ -254,22 +336,39 @@ define <4 x bfloat> @v_mad_mix_v4f32_clamp_postcvt(<4 x bfloat> %src0, <4 x bflo } define <2 x bfloat> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x bfloat> %src0, <2 x bfloat> %src1, <2 x bfloat> %src2) #0 { -; GFX1250-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX1250-NEXT: v_dual_lshlrev_b32 v4, 16, v0 :: v_dual_lshlrev_b32 v6, 16, v1 -; GFX1250-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[4:5], v[6:7], v[0:1] -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_pk_max_num_bf16 v1, v0, v0 clamp -; GFX1250-NEXT: v_bfi_b32 v0, 0xffff, v1, v0 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-FAKE16-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX1250-FAKE16-NEXT: v_dual_lshlrev_b32 v4, 16, v0 :: v_dual_lshlrev_b32 v6, 16, v1 +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_pk_fma_f32 v[0:1], v[4:5], v[6:7], v[0:1] +; GFX1250-FAKE16-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_pk_max_num_bf16 v1, v0, v0 clamp +; GFX1250-FAKE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-REAL16-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX1250-REAL16-NEXT: v_dual_lshlrev_b32 v4, 16, v0 :: v_dual_lshlrev_b32 v6, 16, v1 +; GFX1250-REAL16-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX1250-REAL16-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX1250-REAL16-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX1250-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-REAL16-NEXT: v_pk_fma_f32 v[0:1], v[4:5], v[6:7], v[0:1] +; GFX1250-REAL16-NEXT: v_cvt_pk_bf16_f32 v1, v0, v1 +; GFX1250-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-REAL16-NEXT: v_pk_max_num_bf16 v0, v1, v1 op_sel_hi:[0,0] clamp +; GFX1250-REAL16-NEXT: v_mov_b16_e32 v0.h, v1.h +; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31] %src0.ext = fpext <2 x bfloat> %src0 to <2 x float> %src1.ext = fpext <2 x bfloat> %src1 to <2 x float> %src2.ext = fpext <2 x bfloat> %src2 to <2 x float> @@ -283,24 +382,43 @@ define <2 x bfloat> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x bfloat> %src0, <2 x b } define <2 x bfloat> @v_mad_mix_v2f32_clamp_postcvt_hi(<2 x bfloat> %src0, <2 x bfloat> %src1, <2 x bfloat> %src2) #0 { -; GFX1250-LABEL: v_mad_mix_v2f32_clamp_postcvt_hi: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX1250-NEXT: v_dual_lshlrev_b32 v4, 16, v0 :: v_dual_lshlrev_b32 v6, 16, v1 -; GFX1250-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[4:5], v[6:7], v[0:1] -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX1250-NEXT: v_pk_max_num_bf16 v1, v1, v1 clamp -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-FAKE16-LABEL: v_mad_mix_v2f32_clamp_postcvt_hi: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX1250-FAKE16-NEXT: v_dual_lshlrev_b32 v4, 16, v0 :: v_dual_lshlrev_b32 v6, 16, v1 +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_pk_fma_f32 v[0:1], v[4:5], v[6:7], v[0:1] +; GFX1250-FAKE16-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX1250-FAKE16-NEXT: v_pk_max_num_bf16 v1, v1, v1 clamp +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-REAL16-LABEL: v_mad_mix_v2f32_clamp_postcvt_hi: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX1250-REAL16-NEXT: v_dual_lshlrev_b32 v4, 16, v0 :: v_dual_lshlrev_b32 v6, 16, v1 +; GFX1250-REAL16-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX1250-REAL16-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX1250-REAL16-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX1250-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-REAL16-NEXT: v_pk_fma_f32 v[0:1], v[4:5], v[6:7], v[0:1] +; GFX1250-REAL16-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX1250-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-REAL16-NEXT: v_mov_b16_e32 v1.l, v0.h +; GFX1250-REAL16-NEXT: v_pk_max_num_bf16 v1, v1, v1 op_sel_hi:[0,0] clamp +; GFX1250-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-REAL16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31] %src0.ext = fpext <2 x bfloat> %src0 to <2 x float> %src1.ext = fpext <2 x bfloat> %src1 to <2 x float> %src2.ext = fpext <2 x bfloat> %src2 to <2 x float> @@ -341,17 +459,29 @@ define <2 x bfloat> @v_mad_mix_v2f32_clamp_precvt(<2 x bfloat> %src0, <2 x bfloa } define <3 x bfloat> @v_mad_mix_v3f32_clamp_precvt(<3 x bfloat> %src0, <3 x bfloat> %src1, <3 x bfloat> %src2) #0 { -; GFX1250-LABEL: v_mad_mix_v3f32_clamp_precvt: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_fma_mix_f32_bf16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp -; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX1250-NEXT: v_fma_mix_f32_bf16 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v6, v0 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-FAKE16-LABEL: v_mad_mix_v3f32_clamp_precvt: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_fma_mix_f32_bf16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp +; GFX1250-FAKE16-NEXT: v_fma_mix_f32_bf16 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GFX1250-FAKE16-NEXT: v_fma_mix_f32_bf16 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-FAKE16-NEXT: v_cvt_pk_bf16_f32 v0, v6, v0 +; GFX1250-FAKE16-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-REAL16-LABEL: v_mad_mix_v3f32_clamp_precvt: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: v_fma_mix_f32_bf16 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp +; GFX1250-REAL16-NEXT: v_fma_mix_f32_bf16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp +; GFX1250-REAL16-NEXT: v_fma_mix_f32_bf16 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GFX1250-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-REAL16-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; GFX1250-REAL16-NEXT: v_cvt_pk_bf16_f32 v0, v3, v0 +; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31] %src0.ext = fpext <3 x bfloat> %src0 to <3 x float> %src1.ext = fpext <3 x bfloat> %src1 to <3 x float> %src2.ext = fpext <3 x bfloat> %src2 to <3 x float> @@ -400,14 +530,25 @@ define <4 x bfloat> @v_mad_mix_v4f32_clamp_precvt(<4 x bfloat> %src0, <4 x bfloa } define i32 @mixlo_zext(float %src0, float %src1, float %src2) #0 { -; GFX1250-LABEL: mixlo_zext: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v0, v1, v2 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-FAKE16-LABEL: mixlo_zext: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_fma_mixlo_bf16 v0, v0, v1, v2 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-REAL16-LABEL: mixlo_zext: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: v_fmac_f32_e32 v2, v0, v1 +; GFX1250-REAL16-NEXT: v_mov_b16_e32 v0.h, 0 +; GFX1250-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-REAL16-NEXT: v_cvt_pk_bf16_f32 v1, v2, s0 +; GFX1250-REAL16-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31] %result = call float @llvm.fmuladd.f32(float %src0, float %src1, float %src2) %cvt.result = fptrunc float %result to bfloat %cvt.result.i16 = bitcast bfloat %cvt.result to i16 @@ -416,12 +557,21 @@ define i32 @mixlo_zext(float %src0, float %src1, float %src2) #0 { } define bfloat @mixlo_fptrunc(float %a, float %b) #0 { -; GFX1250-LABEL: mixlo_fptrunc: -; GFX1250: ; %bb.0: ; %.entry -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v0, v1, 0 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-FAKE16-LABEL: mixlo_fptrunc: +; GFX1250-FAKE16: ; %bb.0: ; %.entry +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_fma_mixlo_bf16 v0, v0, v1, 0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-REAL16-LABEL: mixlo_fptrunc: +; GFX1250-REAL16: ; %bb.0: ; %.entry +; GFX1250-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1250-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-REAL16-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31] .entry: %mul = fmul float %a, %b %trunc = fptrunc float %mul to bfloat @@ -429,12 +579,21 @@ define bfloat @mixlo_fptrunc(float %a, float %b) #0 { } define bfloat @mixlo_fptrunc_no_flush(float %a, float %b) { -; GFX1250-LABEL: mixlo_fptrunc_no_flush: -; GFX1250: ; %bb.0: ; %.entry -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v0, v1, 0 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-FAKE16-LABEL: mixlo_fptrunc_no_flush: +; GFX1250-FAKE16: ; %bb.0: ; %.entry +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_fma_mixlo_bf16 v0, v0, v1, 0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-REAL16-LABEL: mixlo_fptrunc_no_flush: +; GFX1250-REAL16: ; %bb.0: ; %.entry +; GFX1250-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1250-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-REAL16-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31] .entry: %mul = fmul float %a, %b %trunc = fptrunc float %mul to bfloat @@ -442,12 +601,21 @@ define bfloat @mixlo_fptrunc_no_flush(float %a, float %b) { } define bfloat @mixlo_fptrunc_abs_src_mod(float %a, float %b) #0 { -; GFX1250-LABEL: mixlo_fptrunc_abs_src_mod: -; GFX1250: ; %bb.0: ; %.entry -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_fma_mixlo_bf16 v0, |v0|, v1, 0 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-FAKE16-LABEL: mixlo_fptrunc_abs_src_mod: +; GFX1250-FAKE16: ; %bb.0: ; %.entry +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_fma_mixlo_bf16 v0, |v0|, v1, 0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-REAL16-LABEL: mixlo_fptrunc_abs_src_mod: +; GFX1250-REAL16: ; %bb.0: ; %.entry +; GFX1250-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: v_mul_f32_e64 v0, |v0|, v1 +; GFX1250-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-REAL16-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31] .entry: %a.fabs = call float @llvm.fabs.f32(float %a) %mul = fmul float %a.fabs, %b @@ -456,12 +624,21 @@ define bfloat @mixlo_fptrunc_abs_src_mod(float %a, float %b) #0 { } define bfloat @mixlo_fptrunc_neg_src_mod(float %a, float %b) #0 { -; GFX1250-LABEL: mixlo_fptrunc_neg_src_mod: -; GFX1250: ; %bb.0: ; %.entry -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_fma_mixlo_bf16 v0, -v0, v1, 0 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-FAKE16-LABEL: mixlo_fptrunc_neg_src_mod: +; GFX1250-FAKE16: ; %bb.0: ; %.entry +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_fma_mixlo_bf16 v0, -v0, v1, 0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-REAL16-LABEL: mixlo_fptrunc_neg_src_mod: +; GFX1250-REAL16: ; %bb.0: ; %.entry +; GFX1250-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: v_mul_f32_e64 v0, -v0, v1 +; GFX1250-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-REAL16-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31] .entry: %a.fneg = fneg float %a %mul = fmul float %a.fneg, %b diff --git a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll index a6217bc1612f3..344d1f5d0b854 100644 --- a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll +++ b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll @@ -1,7 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX942 %s ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90a %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250 %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-FAKE16 %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-REAL16 %s define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) inreg %out) #0 { ; GFX942-LABEL: preload_block_count_x: @@ -281,14 +282,23 @@ define amdgpu_kernel void @incorrect_type_i16_block_count_x(ptr addrspace(1) inr ; GFX90a-NEXT: global_store_short v0, v1, s[8:9] ; GFX90a-NEXT: s_endpgm ; -; GFX1250-LABEL: incorrect_type_i16_block_count_x: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: global_load_u16 v1, v0, s[0:1] offset:8 nv -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: global_store_b16 v0, v1, s[2:3] -; GFX1250-NEXT: s_endpgm +; GFX1250-FAKE16-LABEL: incorrect_type_i16_block_count_x: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] offset:8 nv +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: global_store_b16 v0, v1, s[2:3] +; GFX1250-FAKE16-NEXT: s_endpgm +; +; GFX1250-REAL16-LABEL: incorrect_type_i16_block_count_x: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-REAL16-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-REAL16-NEXT: global_load_u16 v0, v1, s[0:1] offset:8 nv +; GFX1250-REAL16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-REAL16-NEXT: global_store_b16 v1, v0, s[2:3] +; GFX1250-REAL16-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %load = load i16, ptr addrspace(4) %imp_arg_ptr store i16 %load, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll b/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll index ec8ba1dc65459..c1406e161c698 100644 --- a/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll +++ b/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll @@ -5,7 +5,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90A %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX10 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-FAKE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-REAL16 %s ; We want to undo these canonicalizations to enable mad matching: ; (x * y) + x --> x * (y + 1) @@ -847,12 +848,19 @@ define i16 @v_mul_add_1_i16(i16 %x, i16 %y) { ; GFX10-NEXT: v_mad_u16 v0, v0, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX1250-LABEL: v_mul_add_1_i16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mad_u16 v0, v0, v1, v0 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-FAKE16-LABEL: v_mul_add_1_i16: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-REAL16-LABEL: v_mul_add_1_i16: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, v0.l +; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31] %add = add i16 %y, 1 %mul = mul i16 %x, %add ret i16 %mul @@ -888,14 +896,22 @@ define i32 @v_mul_add_1_i16_zext_result(i16 %x, i16 %y) { ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX1250-LABEL: v_mul_add_1_i16_zext_result: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mad_u16 v0, v0, v1, v0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-FAKE16-LABEL: v_mul_add_1_i16_zext_result: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-REAL16-LABEL: v_mul_add_1_i16_zext_result: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, v0.l +; GFX1250-REAL16-NEXT: v_mov_b16_e32 v0.h, 0 +; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31] %add = add i16 %y, 1 %mul = mul i16 %x, %add %zext = zext i16 %mul to i32 @@ -930,12 +946,19 @@ define i16 @v_mul_add_1_i16_commute(i16 %x, i16 %y) { ; GFX10-NEXT: v_mad_u16 v0, v0, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX1250-LABEL: v_mul_add_1_i16_commute: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mad_u16 v0, v0, v1, v0 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-FAKE16-LABEL: v_mul_add_1_i16_commute: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-REAL16-LABEL: v_mul_add_1_i16_commute: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, v0.l +; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31] %add = add i16 %y, 1 %mul = mul i16 %add, %x ret i16 %mul @@ -968,12 +991,19 @@ define i16 @v_mul_add_x_i16(i16 %x, i16 %y) { ; GFX10-NEXT: v_mad_u16 v0, v0, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX1250-LABEL: v_mul_add_x_i16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mad_u16 v0, v0, v1, v0 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-FAKE16-LABEL: v_mul_add_x_i16: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-REAL16-LABEL: v_mul_add_x_i16: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, v0.l +; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31] %mul = mul i16 %x, %y %add = add i16 %x, %mul ret i16 %add @@ -1010,14 +1040,23 @@ define i16 @v_mul_sub_1_i16(i16 %x, i16 %y) { ; GFX10-NEXT: v_mul_lo_u16 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX1250-LABEL: v_mul_sub_1_i16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_add_nc_u16 v1, v1, -1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-FAKE16-LABEL: v_mul_sub_1_i16: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_add_nc_u16 v1, v1, -1 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-REAL16-LABEL: v_mul_sub_1_i16: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: v_add_nc_u16 v1.l, v1.l, -1 +; GFX1250-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-REAL16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l +; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31] %sub = sub i16 %y, 1 %mul = mul i16 %x, %sub ret i16 %mul @@ -1054,14 +1093,23 @@ define i16 @v_mul_sub_1_i16_commute(i16 %x, i16 %y) { ; GFX10-NEXT: v_mul_lo_u16 v0, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX1250-LABEL: v_mul_sub_1_i16_commute: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_add_nc_u16 v1, v1, -1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_mul_lo_u16 v0, v1, v0 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-FAKE16-LABEL: v_mul_sub_1_i16_commute: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_add_nc_u16 v1, v1, -1 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_mul_lo_u16 v0, v1, v0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-REAL16-LABEL: v_mul_sub_1_i16_commute: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: v_add_nc_u16 v1.l, v1.l, -1 +; GFX1250-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-REAL16-NEXT: v_mul_lo_u16 v0.l, v1.l, v0.l +; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31] %sub = sub i16 %y, 1 %mul = mul i16 %sub, %x ret i16 %mul @@ -1098,14 +1146,23 @@ define i16 @v_mul_sub_x_i16(i16 %x, i16 %y) { ; GFX10-NEXT: v_sub_nc_u16 v0, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX1250-LABEL: v_mul_sub_x_i16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mul_lo_u16 v1, v0, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_sub_nc_u16 v0, v1, v0 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-FAKE16-LABEL: v_mul_sub_x_i16: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_mul_lo_u16 v1, v0, v1 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_sub_nc_u16 v0, v1, v0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-REAL16-LABEL: v_mul_sub_x_i16: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: v_mul_lo_u16 v0.h, v0.l, v1.l +; GFX1250-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-REAL16-NEXT: v_sub_nc_u16 v0.l, v0.h, v0.l +; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31] %mul = mul i16 %x, %y %sub = sub i16 %mul, %x ret i16 %sub @@ -1142,14 +1199,23 @@ define i16 @v_mul_add_2_i16(i16 %x, i16 %y) { ; GFX10-NEXT: v_mul_lo_u16 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX1250-LABEL: v_mul_add_2_i16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_add_nc_u16 v1, v1, 2 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-FAKE16-LABEL: v_mul_add_2_i16: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_add_nc_u16 v1, v1, 2 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-REAL16-LABEL: v_mul_add_2_i16: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: v_add_nc_u16 v1.l, v1.l, 2 +; GFX1250-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-REAL16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l +; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31] %add = add i16 %y, 2 %mul = mul i16 %x, %add ret i16 %mul @@ -1186,14 +1252,23 @@ define i16 @v_mul_sub_2_i16(i16 %x, i16 %y) { ; GFX10-NEXT: v_mul_lo_u16 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX1250-LABEL: v_mul_sub_2_i16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_add_nc_u16 v1, v1, -2 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-FAKE16-LABEL: v_mul_sub_2_i16: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_add_nc_u16 v1, v1, -2 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-REAL16-LABEL: v_mul_sub_2_i16: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: v_add_nc_u16 v1.l, v1.l, -2 +; GFX1250-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-REAL16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l +; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31] %sub = sub i16 %y, 2 %mul = mul i16 %x, %sub ret i16 %mul @@ -3288,12 +3363,19 @@ define i16 @v_mul_9_add_52_i16(i16 %arg) { ; GFX10-NEXT: v_mad_u16 v0, v0, 9, 52 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX1250-LABEL: v_mul_9_add_52_i16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mad_u16 v0, v0, 9, 52 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-FAKE16-LABEL: v_mul_9_add_52_i16: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_mad_u16 v0, v0, 9, 52 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-REAL16-LABEL: v_mul_9_add_52_i16: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: v_mad_u16 v0.l, v0.l, 9, 52 +; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31] %mul = mul i16 %arg, 9 %add = add i16 %mul, 52 ret i16 %add @@ -3523,12 +3605,19 @@ define i16 @v_mul_5_add_1_i16(i16 %arg) { ; GFX10-NEXT: v_mad_u16 v0, v0, 5, 1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX1250-LABEL: v_mul_5_add_1_i16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mad_u16 v0, v0, 5, 1 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-FAKE16-LABEL: v_mul_5_add_1_i16: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_mad_u16 v0, v0, 5, 1 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-REAL16-LABEL: v_mul_5_add_1_i16: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: v_mad_u16 v0.l, v0.l, 5, 1 +; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31] %mul = mul i16 %arg, 5 %add = add i16 %mul, 1 ret i16 %add @@ -3567,14 +3656,23 @@ define i16 @v_mul_284_add_82_i16(i16 %arg) { ; GFX10-NEXT: v_mad_u16 v0, v0, s4, 0x52 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX1250-LABEL: v_mul_284_add_82_i16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_movk_i32 s0, 0x11c -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1250-NEXT: v_mad_u16 v0, v0, s0, 0x52 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-FAKE16-LABEL: v_mul_284_add_82_i16: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: s_movk_i32 s0, 0x11c +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-FAKE16-NEXT: v_mad_u16 v0, v0, s0, 0x52 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-REAL16-LABEL: v_mul_284_add_82_i16: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: v_mov_b16_e32 v1.l, 0x11c +; GFX1250-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-REAL16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, 0x52 +; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31] %mul = mul i16 %arg, 284 %add = add i16 %mul, 82 ret i16 %add @@ -4284,12 +4382,19 @@ define i8 @v_mul_add_1_i8(i8 %x, i8 %y) { ; GFX10-NEXT: v_mad_u16 v0, v0, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX1250-LABEL: v_mul_add_1_i8: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mad_u16 v0, v0, v1, v0 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-FAKE16-LABEL: v_mul_add_1_i8: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-REAL16-LABEL: v_mul_add_1_i8: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, v0.l +; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31] %add = add i8 %y, 1 %mul = mul i8 %x, %add ret i8 %mul @@ -4323,12 +4428,19 @@ define i8 @v_mul_add_1_i8_commute(i8 %x, i8 %y) { ; GFX10-NEXT: v_mad_u16 v0, v0, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX1250-LABEL: v_mul_add_1_i8_commute: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mad_u16 v0, v0, v1, v0 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-FAKE16-LABEL: v_mul_add_1_i8_commute: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-REAL16-LABEL: v_mul_add_1_i8_commute: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, v0.l +; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31] %add = add i8 %y, 1 %mul = mul i8 %add, %x ret i8 %mul @@ -4361,12 +4473,19 @@ define i8 @v_mul_add_1_i8_zext(i8 zeroext %x, i8 zeroext %y) { ; GFX10-NEXT: v_mad_u16 v0, v0, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX1250-LABEL: v_mul_add_1_i8_zext: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mad_u16 v0, v0, v1, v0 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-FAKE16-LABEL: v_mul_add_1_i8_zext: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-REAL16-LABEL: v_mul_add_1_i8_zext: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, v0.l +; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31] %add = add i8 %y, 1 %mul = mul i8 %x, %add ret i8 %mul @@ -4399,12 +4518,19 @@ define i8 @v_mul_add_1_i8_zext_commute(i8 zeroext %x, i8 zeroext %y) { ; GFX10-NEXT: v_mad_u16 v0, v0, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX1250-LABEL: v_mul_add_1_i8_zext_commute: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mad_u16 v0, v0, v1, v0 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-FAKE16-LABEL: v_mul_add_1_i8_zext_commute: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-REAL16-LABEL: v_mul_add_1_i8_zext_commute: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, v0.l +; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31] %add = add i8 %y, 1 %mul = mul i8 %add, %x ret i8 %mul @@ -4457,17 +4583,29 @@ define <2 x i8> @v_mul_add_1_v2i8(<2 x i8> %x, <2 x i8> %y) { ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX1250-LABEL: v_mul_add_1_v2i8: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mad_u16 v1, v1, v3, v1 -; GFX1250-NEXT: v_mad_u16 v0, v0, v2, v0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_lshlrev_b16 v2, 8, v1 -; GFX1250-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX1250-NEXT: v_bitop3_b16 v0, v0, v2, 0xff bitop3:0xec -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-FAKE16-LABEL: v_mul_add_1_v2i8: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_mad_u16 v1, v1, v3, v1 +; GFX1250-FAKE16-NEXT: v_mad_u16 v0, v0, v2, v0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v2, 8, v1 +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX1250-FAKE16-NEXT: v_bitop3_b16 v0, v0, v2, 0xff bitop3:0xec +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-REAL16-LABEL: v_mul_add_1_v2i8: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: v_mad_u16 v0.h, v1.l, v3.l, v1.l +; GFX1250-REAL16-NEXT: v_mad_u16 v0.l, v0.l, v2.l, v0.l +; GFX1250-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-REAL16-NEXT: v_lshlrev_b16 v1.l, 8, v0.h +; GFX1250-REAL16-NEXT: v_bitop3_b16 v0.l, v0.l, v1.l, 0xff bitop3:0xec +; GFX1250-REAL16-NEXT: v_and_b16 v1.l, 0xff, v0.h +; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31] %add = add <2 x i8> %y, %mul = mul <2 x i8> %x, %add ret <2 x i8> %mul @@ -4520,17 +4658,29 @@ define <2 x i8> @v_mul_add_1_v2i8_commute(<2 x i8> %x, <2 x i8> %y) { ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX1250-LABEL: v_mul_add_1_v2i8_commute: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mad_u16 v1, v1, v3, v1 -; GFX1250-NEXT: v_mad_u16 v0, v0, v2, v0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_lshlrev_b16 v2, 8, v1 -; GFX1250-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX1250-NEXT: v_bitop3_b16 v0, v0, v2, 0xff bitop3:0xec -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-FAKE16-LABEL: v_mul_add_1_v2i8_commute: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_mad_u16 v1, v1, v3, v1 +; GFX1250-FAKE16-NEXT: v_mad_u16 v0, v0, v2, v0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v2, 8, v1 +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX1250-FAKE16-NEXT: v_bitop3_b16 v0, v0, v2, 0xff bitop3:0xec +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-REAL16-LABEL: v_mul_add_1_v2i8_commute: +; GFX1250-REAL16: ; %bb.0: +; GFX1250-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-REAL16-NEXT: v_mad_u16 v0.h, v1.l, v3.l, v1.l +; GFX1250-REAL16-NEXT: v_mad_u16 v0.l, v0.l, v2.l, v0.l +; GFX1250-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-REAL16-NEXT: v_lshlrev_b16 v1.l, 8, v0.h +; GFX1250-REAL16-NEXT: v_bitop3_b16 v0.l, v0.l, v1.l, 0xff bitop3:0xec +; GFX1250-REAL16-NEXT: v_and_b16 v1.l, 0xff, v0.h +; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31] %add = add <2 x i8> %y, %mul = mul <2 x i8> %add, %x ret <2 x i8> %mul diff --git a/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll b/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll index c47d074aca865..95d48915e6ac9 100644 --- a/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll +++ b/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GCN,SDAG %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GCN,GISEL %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck --check-prefixes=GCN,SDAG,GCN-FAKE16,SDAG-FAKE16 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck --check-prefixes=GCN,GISEL,GCN-FAKE16,GISEL-FAKE16 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck --check-prefixes=GCN,SDAG,GCN-REAL16,SDAG-REAL16 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck --check-prefixes=GCN,GISEL,GCN-REAL16,GISEL-REAL16 %s define amdgpu_ps float @flat_load_b32_idxprom(ptr align 4 inreg %p, i32 %idx) { ; GCN-LABEL: flat_load_b32_idxprom: @@ -300,12 +302,19 @@ entry: } define amdgpu_ps void @flat_store_b16_idxprom(ptr align 2 inreg %p, i32 %idx) { -; GCN-LABEL: flat_store_b16_idxprom: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-NEXT: v_mov_b32_e32 v1, 1 -; GCN-NEXT: flat_store_b16 v0, v1, s[0:1] scale_offset -; GCN-NEXT: s_endpgm +; GCN-FAKE16-LABEL: flat_store_b16_idxprom: +; GCN-FAKE16: ; %bb.0: ; %entry +; GCN-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GCN-FAKE16-NEXT: v_mov_b32_e32 v1, 1 +; GCN-FAKE16-NEXT: flat_store_b16 v0, v1, s[0:1] scale_offset +; GCN-FAKE16-NEXT: s_endpgm +; +; GCN-REAL16-LABEL: flat_store_b16_idxprom: +; GCN-REAL16: ; %bb.0: ; %entry +; GCN-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GCN-REAL16-NEXT: v_mov_b16_e32 v1.l, 1 +; GCN-REAL16-NEXT: flat_store_b16 v0, v1, s[0:1] scale_offset +; GCN-REAL16-NEXT: s_endpgm entry: %idxprom = sext i32 %idx to i64 %arrayidx = getelementptr inbounds i16, ptr %p, i64 %idxprom @@ -436,3 +445,8 @@ entry: } !0 = !{i32 0, i32 1024} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GISEL-FAKE16: {{.*}} +; GISEL-REAL16: {{.*}} +; SDAG-FAKE16: {{.*}} +; SDAG-REAL16: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/scale-offset-global.ll b/llvm/test/CodeGen/AMDGPU/scale-offset-global.ll index 53df8dfeef8ec..5bc27103debba 100644 --- a/llvm/test/CodeGen/AMDGPU/scale-offset-global.ll +++ b/llvm/test/CodeGen/AMDGPU/scale-offset-global.ll @@ -1,6 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GCN,SDAG %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GCN,GISEL %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck --check-prefixes=GCN,SDAG,GCN-FAKE16,SDAG-FAKE16 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck --check-prefixes=GCN,GISEL,GCN-FAKE16,GISEL-FAKE16 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck --check-prefixes=GCN,SDAG,GCN-REAL16,SDAG-REAL16 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck --check-prefixes=GCN,GISEL,GCN-REAL16,GISEL-REAL16 %s + define amdgpu_ps float @global_load_b32_idxprom(ptr addrspace(1) align 4 inreg %p, i32 %idx) { ; GCN-LABEL: global_load_b32_idxprom: @@ -300,12 +303,19 @@ entry: } define amdgpu_ps void @global_store_b16_idxprom(ptr addrspace(1) align 2 inreg %p, i32 %idx) { -; GCN-LABEL: global_store_b16_idxprom: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-NEXT: v_mov_b32_e32 v1, 1 -; GCN-NEXT: global_store_b16 v0, v1, s[0:1] scale_offset -; GCN-NEXT: s_endpgm +; GCN-FAKE16-LABEL: global_store_b16_idxprom: +; GCN-FAKE16: ; %bb.0: ; %entry +; GCN-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GCN-FAKE16-NEXT: v_mov_b32_e32 v1, 1 +; GCN-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] scale_offset +; GCN-FAKE16-NEXT: s_endpgm +; +; GCN-REAL16-LABEL: global_store_b16_idxprom: +; GCN-REAL16: ; %bb.0: ; %entry +; GCN-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GCN-REAL16-NEXT: v_mov_b16_e32 v1.l, 1 +; GCN-REAL16-NEXT: global_store_b16 v0, v1, s[0:1] scale_offset +; GCN-REAL16-NEXT: s_endpgm entry: %idxprom = sext i32 %idx to i64 %arrayidx = getelementptr inbounds i16, ptr addrspace(1) %p, i64 %idxprom @@ -360,4 +370,8 @@ entry: !0 = !{i32 0, i32 1024} ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; GISEL: {{.*}} +; GISEL-FAKE16: {{.*}} +; GISEL-REAL16: {{.*}} ; SDAG: {{.*}} +; SDAG-FAKE16: {{.*}} +; SDAG-REAL16: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/scale-offset-scratch.ll b/llvm/test/CodeGen/AMDGPU/scale-offset-scratch.ll index bc88aaf57b0bd..24ae4a022b546 100644 --- a/llvm/test/CodeGen/AMDGPU/scale-offset-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/scale-offset-scratch.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefix=GCN %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefix=GCN %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck --check-prefixes=GCN,SDAG,GCN-FAKE16,SDAG-FAKE16 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck --check-prefixes=GCN,GISEL,GCN-FAKE16,GISEL-FAKE16 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck --check-prefixes=GCN,SDAG,GCN-REAL16,SDAG-REAL16 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck --check-prefixes=GCN,GISEL,GCN-REAL16,GISEL-REAL16 %s define amdgpu_ps float @scratch_load_b32_alloca_idxprom(i32 %idx) { ; GCN-LABEL: scratch_load_b32_alloca_idxprom: @@ -313,12 +315,19 @@ entry: } define amdgpu_ps void @scratch_store_b16_idxprom(ptr addrspace(5) align 2 inreg %p, i32 %idx) { -; GCN-LABEL: scratch_store_b16_idxprom: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-NEXT: v_mov_b32_e32 v1, 1 -; GCN-NEXT: scratch_store_b16 v0, v1, s0 scale_offset -; GCN-NEXT: s_endpgm +; GCN-FAKE16-LABEL: scratch_store_b16_idxprom: +; GCN-FAKE16: ; %bb.0: ; %entry +; GCN-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GCN-FAKE16-NEXT: v_mov_b32_e32 v1, 1 +; GCN-FAKE16-NEXT: scratch_store_b16 v0, v1, s0 scale_offset +; GCN-FAKE16-NEXT: s_endpgm +; +; GCN-REAL16-LABEL: scratch_store_b16_idxprom: +; GCN-REAL16: ; %bb.0: ; %entry +; GCN-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GCN-REAL16-NEXT: v_mov_b16_e32 v1.l, 1 +; GCN-REAL16-NEXT: scratch_store_b16 v0, v1, s0 scale_offset +; GCN-REAL16-NEXT: s_endpgm entry: %idxprom = zext i32 %idx to i64 %arrayidx = getelementptr inbounds i16, ptr addrspace(5) %p, i64 %idxprom @@ -341,3 +350,10 @@ entry: } !0 = !{i32 0, i32 1024} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GISEL: {{.*}} +; GISEL-FAKE16: {{.*}} +; GISEL-REAL16: {{.*}} +; SDAG: {{.*}} +; SDAG-FAKE16: {{.*}} +; SDAG-REAL16: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250.mir b/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250.mir index e27d22558b755..f018ca040a304 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250.mir +++ b/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250.mir @@ -1,5 +1,5 @@ -# RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -start-before=amdgpu-lower-vgpr-encoding -o - %s | FileCheck -check-prefixes=GCN,ASM %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -start-before=amdgpu-lower-vgpr-encoding -o - %s | llvm-mc -triple=amdgcn -mcpu=gfx1250 -filetype=obj -o - | llvm-objdump -d --mcpu=gfx1250 - | FileCheck -check-prefixes=GCN,DIS %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -start-before=amdgpu-lower-vgpr-encoding -o - %s | FileCheck -check-prefixes=GCN,ASM %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -start-before=amdgpu-lower-vgpr-encoding -o - %s | llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -filetype=obj -o - | llvm-objdump -d --mcpu=gfx1250 --mattr=-real-true16 - | FileCheck -check-prefixes=GCN,DIS %s # ASM-LABEL: {{^}}high_vgprs: # DIS-LABEL: :