diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp index 86af897943dae..9ae043048b932 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp @@ -127,92 +127,8 @@ class LiveRegOptimizer { return LK.first != TargetLoweringBase::TypeLegal; } - /// Check if intrinsic natively operates on 8-bit or 16-bit - bool isNativeIntrinsic(Intrinsic::ID ID) { - switch (ID) { - case Intrinsic::amdgcn_dot4_f32_fp8_bf8: - case Intrinsic::amdgcn_dot4_f32_bf8_fp8: - case Intrinsic::amdgcn_dot4_f32_fp8_fp8: - case Intrinsic::amdgcn_dot4_f32_bf8_bf8: - case Intrinsic::amdgcn_mfma_i32_4x4x4i8: - case Intrinsic::amdgcn_mfma_i32_16x16x4i8: - case Intrinsic::amdgcn_mfma_i32_32x32x4i8: - case Intrinsic::amdgcn_mfma_i32_16x16x16i8: - case Intrinsic::amdgcn_mfma_i32_32x32x8i8: - case Intrinsic::amdgcn_mfma_i32_16x16x64_i8: - case Intrinsic::amdgcn_mfma_i32_32x32x32_i8: - case Intrinsic::amdgcn_mfma_i32_32x32x16_i8: - case Intrinsic::amdgcn_mfma_i32_16x16x32_i8: - case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_bf8: - case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_fp8: - case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_bf8: - case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_fp8: - case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_bf8: - case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_fp8: - case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_bf8: - case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_fp8: - case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8: - case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8: - case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8: - case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8: - case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8: - case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8: - case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8: - case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8: - case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8: - case 
Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8: - case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8: - case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8: - case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8: - case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8: - case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8: - case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8: - case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8: - case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8: - case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8: - case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8: - case Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_fp8: - case Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_bf8: - case Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_fp8: - case Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_bf8: - case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8: - case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8: - case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8: - case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: - case Intrinsic::amdgcn_wmma_i32_16x16x16_iu8: - case Intrinsic::amdgcn_wmma_i32_16x16x16_iu4: - case Intrinsic::amdgcn_wmma_i32_16x16x32_iu4: - case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8: - case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4: - case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: - case Intrinsic::amdgcn_raw_buffer_store_format: - case Intrinsic::amdgcn_raw_buffer_store: - case Intrinsic::amdgcn_raw_ptr_buffer_store_format: - case Intrinsic::amdgcn_raw_ptr_buffer_store: - case Intrinsic::amdgcn_struct_buffer_store_format: - case Intrinsic::amdgcn_struct_buffer_store: - case Intrinsic::amdgcn_struct_ptr_buffer_store_format: - case Intrinsic::amdgcn_struct_ptr_buffer_store: - case Intrinsic::amdgcn_raw_tbuffer_store: - case Intrinsic::amdgcn_raw_ptr_tbuffer_store: - case Intrinsic::amdgcn_struct_ptr_tbuffer_store: - case Intrinsic::amdgcn_struct_tbuffer_store: - return true; - default: - return false; - } - } - bool isOpLegal(Instruction *I) { - if (const auto *Intr = 
dyn_cast(I)) { - Intrinsic::ID ID = Intr->getIntrinsicID(); - if (isNativeIntrinsic(ID)) - return true; - } - // Stores - if (isa(I)) - return true; - return false; + return isa(I) || isa(I); } bool isCoercionProfitable(Instruction *II) { diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll index ee6a63fc1f7e1..a401f989a2507 100644 --- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll +++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll @@ -1,37 +1,40 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 -; RUN: llc -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck --check-prefix=GFX906 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefix=GFX942 %s define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { -; GFX906-LABEL: v3i8_liveout: -; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX906-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX906-NEXT: v_mov_b32_e32 v3, 8 -; GFX906-NEXT: s_mov_b32 s4, 0xff0000 -; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dword v4, v2, s[0:1] -; GFX906-NEXT: v_mov_b32_e32 v1, 0 -; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshrrev_b32_sdwa v5, v3, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX906-NEXT: v_and_or_b32 v4, v4, s4, v5 -; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX906-NEXT: s_cbranch_execz .LBB0_2 -; GFX906-NEXT: ; %bb.1: ; %bb.1 -; GFX906-NEXT: global_load_dword v0, v2, s[2:3] -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshrrev_b32_sdwa v2, v3, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX906-NEXT: v_and_or_b32 v4, v0, s4, v2 -; GFX906-NEXT: .LBB0_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX906-NEXT: global_store_byte_d16_hi v1, v4, s[6:7] offset:2 -; GFX906-NEXT: global_store_short v1, v4, s[6:7] -; GFX906-NEXT: s_endpgm +; GFX942-LABEL: v3i8_liveout: +; GFX942: ; %bb.0: ; %entry +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX942-NEXT: v_and_b32_e32 v4, 0x3ff, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, 2, v4 +; GFX942-NEXT: v_mov_b32_e32 v2, 8 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: global_load_dword v3, v1, s[0:1] +; GFX942-NEXT: s_mov_b32 s4, 0xff0000 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v4 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_lshrrev_b32_sdwa v5, v2, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_or_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX942-NEXT: v_and_or_b32 v3, v3, s4, v5 +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX942-NEXT: s_cbranch_execz .LBB0_2 +; GFX942-NEXT: ; %bb.1: ; %bb.1 +; GFX942-NEXT: global_load_dword v1, v1, s[2:3] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_or_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX942-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX942-NEXT: v_and_or_b32 v3, v1, s4, v2 +; GFX942-NEXT: .LBB0_2: ; %bb.2 +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: global_store_byte_d16_hi v0, v3, s[6:7] offset:2 +; GFX942-NEXT: 
global_store_short v0, v3, s[6:7] +; GFX942-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() %gep1 = getelementptr <3 x i8>, ptr addrspace(1) %src1, i32 %idx @@ -50,24 +53,25 @@ bb.2: } define amdgpu_kernel void @v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { -; GFX906-LABEL: v4i8_liveout: -; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX906-NEXT: v_lshlrev_b32_e32 v3, 2, v0 -; GFX906-NEXT: v_mov_b32_e32 v1, 0 -; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 -; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dword v2, v3, s[0:1] -; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX906-NEXT: s_cbranch_execz .LBB1_2 -; GFX906-NEXT: ; %bb.1: ; %bb.1 -; GFX906-NEXT: global_load_dword v2, v3, s[2:3] -; GFX906-NEXT: .LBB1_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: global_store_dword v1, v2, s[6:7] -; GFX906-NEXT: s_endpgm +; GFX942-LABEL: v4i8_liveout: +; GFX942: ; %bb.0: ; %entry +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX942-NEXT: v_and_b32_e32 v3, 0x3ff, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 2, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: global_load_dword v1, v2, s[0:1] +; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v3 +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX942-NEXT: s_cbranch_execz .LBB1_2 +; GFX942-NEXT: ; %bb.1: ; %bb.1 +; GFX942-NEXT: global_load_dword v1, v2, s[2:3] +; GFX942-NEXT: .LBB1_2: ; %bb.2 +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: global_store_dword v0, v1, s[6:7] +; GFX942-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx @@ -86,28 +90,29 @@ bb.2: } define amdgpu_kernel 
void @v5i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { -; GFX906-LABEL: v5i8_liveout: -; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX906-NEXT: v_lshlrev_b32_e32 v4, 3, v0 -; GFX906-NEXT: v_mov_b32_e32 v3, 0 -; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 -; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dwordx2 v[1:2], v4, s[0:1] -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX906-NEXT: s_cbranch_execz .LBB2_2 -; GFX906-NEXT: ; %bb.1: ; %bb.1 -; GFX906-NEXT: global_load_dwordx2 v[1:2], v4, s[2:3] -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX906-NEXT: .LBB2_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX906-NEXT: global_store_byte v3, v2, s[6:7] offset:4 -; GFX906-NEXT: global_store_dword v3, v1, s[6:7] -; GFX906-NEXT: s_endpgm +; GFX942-LABEL: v5i8_liveout: +; GFX942: ; %bb.0: ; %entry +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX942-NEXT: v_and_b32_e32 v4, 0x3ff, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v4 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[0:1] +; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v4 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX942-NEXT: s_cbranch_execz .LBB2_2 +; GFX942-NEXT: ; %bb.1: ; %bb.1 +; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[2:3] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX942-NEXT: .LBB2_2: ; %bb.2 +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: global_store_byte v2, v1, s[6:7] offset:4 +; GFX942-NEXT: global_store_dword v2, v0, s[6:7] +; GFX942-NEXT: s_endpgm entry: 
%idx = call i32 @llvm.amdgcn.workitem.id.x() %gep1 = getelementptr <5 x i8>, ptr addrspace(1) %src1, i32 %idx @@ -126,24 +131,25 @@ bb.2: } define amdgpu_kernel void @v8i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { -; GFX906-LABEL: v8i8_liveout: -; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX906-NEXT: v_lshlrev_b32_e32 v4, 3, v0 -; GFX906-NEXT: v_mov_b32_e32 v3, 0 -; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 -; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dwordx2 v[1:2], v4, s[0:1] -; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX906-NEXT: s_cbranch_execz .LBB3_2 -; GFX906-NEXT: ; %bb.1: ; %bb.1 -; GFX906-NEXT: global_load_dwordx2 v[1:2], v4, s[2:3] -; GFX906-NEXT: .LBB3_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: global_store_dwordx2 v3, v[1:2], s[6:7] -; GFX906-NEXT: s_endpgm +; GFX942-LABEL: v8i8_liveout: +; GFX942: ; %bb.0: ; %entry +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX942-NEXT: v_and_b32_e32 v4, 0x3ff, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v4 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[0:1] +; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v4 +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX942-NEXT: s_cbranch_execz .LBB3_2 +; GFX942-NEXT: ; %bb.1: ; %bb.1 +; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[2:3] +; GFX942-NEXT: .LBB3_2: ; %bb.2 +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX942-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx @@ -162,24 +168,25 @@ bb.2: } define amdgpu_kernel void @v16i8_liveout(ptr 
addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { -; GFX906-LABEL: v16i8_liveout: -; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX906-NEXT: v_lshlrev_b32_e32 v6, 4, v0 -; GFX906-NEXT: v_mov_b32_e32 v5, 0 -; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 -; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dwordx4 v[1:4], v6, s[0:1] -; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX906-NEXT: s_cbranch_execz .LBB4_2 -; GFX906-NEXT: ; %bb.1: ; %bb.1 -; GFX906-NEXT: global_load_dwordx4 v[1:4], v6, s[2:3] -; GFX906-NEXT: .LBB4_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: global_store_dwordx4 v5, v[1:4], s[6:7] -; GFX906-NEXT: s_endpgm +; GFX942-LABEL: v16i8_liveout: +; GFX942: ; %bb.0: ; %entry +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX942-NEXT: v_and_b32_e32 v6, 0x3ff, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v5, 4, v6 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: global_load_dwordx4 v[0:3], v5, s[0:1] +; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v6 +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX942-NEXT: s_cbranch_execz .LBB4_2 +; GFX942-NEXT: ; %bb.1: ; %bb.1 +; GFX942-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] +; GFX942-NEXT: .LBB4_2: ; %bb.2 +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GFX942-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() %gep1 = getelementptr <16 x i8>, ptr addrspace(1) %src1, i32 %idx @@ -198,28 +205,29 @@ bb.2: } define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { -; GFX906-LABEL: v32i8_liveout: -; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx4 s[0:3], 
s[4:5], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX906-NEXT: v_lshlrev_b32_e32 v10, 5, v0 -; GFX906-NEXT: v_mov_b32_e32 v9, 0 -; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 -; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dwordx4 v[5:8], v10, s[0:1] offset:16 -; GFX906-NEXT: global_load_dwordx4 v[1:4], v10, s[0:1] -; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX906-NEXT: s_cbranch_execz .LBB5_2 -; GFX906-NEXT: ; %bb.1: ; %bb.1 -; GFX906-NEXT: global_load_dwordx4 v[5:8], v10, s[2:3] offset:16 -; GFX906-NEXT: global_load_dwordx4 v[1:4], v10, s[2:3] -; GFX906-NEXT: .LBB5_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: global_store_dwordx4 v9, v[5:8], s[6:7] offset:16 -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: global_store_dwordx4 v9, v[1:4], s[6:7] -; GFX906-NEXT: s_endpgm +; GFX942-LABEL: v32i8_liveout: +; GFX942: ; %bb.0: ; %entry +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX942-NEXT: v_and_b32_e32 v10, 0x3ff, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v9, 5, v10 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: global_load_dwordx4 v[4:7], v9, s[0:1] offset:16 +; GFX942-NEXT: global_load_dwordx4 v[0:3], v9, s[0:1] +; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v10 +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX942-NEXT: s_cbranch_execz .LBB5_2 +; GFX942-NEXT: ; %bb.1: ; %bb.1 +; GFX942-NEXT: global_load_dwordx4 v[4:7], v9, s[2:3] offset:16 +; GFX942-NEXT: global_load_dwordx4 v[0:3], v9, s[2:3] +; GFX942-NEXT: .LBB5_2: ; %bb.2 +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(1) +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[6:7] offset:16 +; GFX942-NEXT: s_waitcnt vmcnt(1) +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[6:7] +; GFX942-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() %gep1 = getelementptr <32 x i8>, 
ptr addrspace(1) %src1, i32 %idx @@ -238,101 +246,77 @@ bb.2: } define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { -; GFX906-LABEL: v256i8_liveout: -; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX906-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX906-NEXT: v_lshlrev_b32_e32 v61, 3, v0 -; GFX906-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX906-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dwordx4 v[13:16], v61, s[0:1] offset:240 -; GFX906-NEXT: global_load_dwordx4 v[9:12], v61, s[0:1] offset:224 -; GFX906-NEXT: global_load_dwordx4 v[5:8], v61, s[0:1] offset:208 -; GFX906-NEXT: global_load_dwordx4 v[17:20], v61, s[0:1] offset:192 -; GFX906-NEXT: s_mov_b32 s14, -1 -; GFX906-NEXT: s_mov_b32 s15, 0xe00000 -; GFX906-NEXT: s_add_u32 s12, s12, s11 -; GFX906-NEXT: s_addc_u32 s13, s13, 0 -; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 -; GFX906-NEXT: v_mov_b32_e32 v4, 0 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: buffer_store_dword v17, off, s[12:15], 0 ; 4-byte Folded Spill -; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_store_dword v18, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v19, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v20, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill -; GFX906-NEXT: global_load_dwordx4 v[29:32], v61, s[0:1] offset:176 -; GFX906-NEXT: global_load_dwordx4 v[25:28], v61, s[0:1] offset:160 -; GFX906-NEXT: global_load_dwordx4 v[21:24], v61, s[0:1] offset:144 -; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: global_load_dwordx4 v[17:20], v61, s[0:1] offset:128 -; GFX906-NEXT: global_load_dwordx4 v[0:3], v61, s[0:1] offset:112 -; GFX906-NEXT: global_load_dwordx4 v[57:60], v61, s[0:1] offset:96 -; GFX906-NEXT: global_load_dwordx4 v[53:56], v61, s[0:1] offset:80 -; GFX906-NEXT: global_load_dwordx4 v[49:52], 
v61, s[0:1] offset:64 -; GFX906-NEXT: global_load_dwordx4 v[45:48], v61, s[0:1] offset:48 -; GFX906-NEXT: global_load_dwordx4 v[41:44], v61, s[0:1] offset:32 -; GFX906-NEXT: global_load_dwordx4 v[37:40], v61, s[0:1] offset:16 -; GFX906-NEXT: global_load_dwordx4 v[33:36], v61, s[0:1] -; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX906-NEXT: s_cbranch_execz .LBB6_2 -; GFX906-NEXT: ; %bb.1: ; %bb.1 -; GFX906-NEXT: global_load_dwordx4 v[13:16], v61, s[2:3] offset:240 -; GFX906-NEXT: global_load_dwordx4 v[9:12], v61, s[2:3] offset:224 -; GFX906-NEXT: global_load_dwordx4 v[5:8], v61, s[2:3] offset:208 -; GFX906-NEXT: global_load_dwordx4 v[0:3], v61, s[2:3] offset:192 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill -; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill -; GFX906-NEXT: global_load_dwordx4 v[29:32], v61, s[2:3] offset:176 -; GFX906-NEXT: global_load_dwordx4 v[25:28], v61, s[2:3] offset:160 -; GFX906-NEXT: global_load_dwordx4 v[21:24], v61, s[2:3] offset:144 -; GFX906-NEXT: global_load_dwordx4 v[17:20], v61, s[2:3] offset:128 -; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: global_load_dwordx4 v[0:3], v61, s[2:3] offset:112 -; GFX906-NEXT: global_load_dwordx4 v[57:60], v61, s[2:3] offset:96 -; GFX906-NEXT: global_load_dwordx4 v[53:56], v61, s[2:3] offset:80 -; GFX906-NEXT: global_load_dwordx4 v[49:52], v61, s[2:3] offset:64 -; GFX906-NEXT: global_load_dwordx4 v[45:48], v61, s[2:3] offset:48 -; GFX906-NEXT: global_load_dwordx4 v[41:44], v61, s[2:3] offset:32 -; GFX906-NEXT: global_load_dwordx4 v[37:40], v61, s[2:3] offset:16 -; GFX906-NEXT: global_load_dwordx4 v[33:36], v61, s[2:3] -; GFX906-NEXT: .LBB6_2: ; %bb.2 -; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX906-NEXT: 
s_waitcnt vmcnt(7) -; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:112 -; GFX906-NEXT: s_waitcnt vmcnt(7) -; GFX906-NEXT: global_store_dwordx4 v4, v[57:60], s[6:7] offset:96 -; GFX906-NEXT: s_waitcnt vmcnt(7) -; GFX906-NEXT: global_store_dwordx4 v4, v[53:56], s[6:7] offset:80 -; GFX906-NEXT: s_waitcnt vmcnt(7) -; GFX906-NEXT: global_store_dwordx4 v4, v[49:52], s[6:7] offset:64 -; GFX906-NEXT: s_waitcnt vmcnt(7) -; GFX906-NEXT: global_store_dwordx4 v4, v[45:48], s[6:7] offset:48 -; GFX906-NEXT: s_waitcnt vmcnt(7) -; GFX906-NEXT: global_store_dwordx4 v4, v[41:44], s[6:7] offset:32 -; GFX906-NEXT: s_waitcnt vmcnt(7) -; GFX906-NEXT: global_store_dwordx4 v4, v[37:40], s[6:7] offset:16 -; GFX906-NEXT: s_waitcnt vmcnt(7) -; GFX906-NEXT: global_store_dwordx4 v4, v[33:36], s[6:7] -; GFX906-NEXT: global_store_dwordx4 v4, v[13:16], s[6:7] offset:240 -; GFX906-NEXT: global_store_dwordx4 v4, v[9:12], s[6:7] offset:224 -; GFX906-NEXT: global_store_dwordx4 v4, v[5:8], s[6:7] offset:208 -; GFX906-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:192 -; GFX906-NEXT: global_store_dwordx4 v4, v[29:32], s[6:7] offset:176 -; GFX906-NEXT: global_store_dwordx4 v4, v[25:28], s[6:7] offset:160 -; GFX906-NEXT: global_store_dwordx4 v4, v[21:24], s[6:7] offset:144 -; GFX906-NEXT: global_store_dwordx4 v4, v[17:20], s[6:7] offset:128 -; GFX906-NEXT: s_endpgm +; GFX942-LABEL: v256i8_liveout: +; GFX942: ; %bb.0: ; %entry +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX942-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, 3, v2 
+; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: global_load_dwordx4 v[28:31], v1, s[0:1] offset:240 +; GFX942-NEXT: global_load_dwordx4 v[24:27], v1, s[0:1] offset:224 +; GFX942-NEXT: global_load_dwordx4 v[20:23], v1, s[0:1] offset:208 +; GFX942-NEXT: global_load_dwordx4 v[16:19], v1, s[0:1] offset:192 +; GFX942-NEXT: global_load_dwordx4 v[12:15], v1, s[0:1] offset:176 +; GFX942-NEXT: global_load_dwordx4 v[8:11], v1, s[0:1] offset:160 +; GFX942-NEXT: global_load_dwordx4 v[4:7], v1, s[0:1] offset:144 +; GFX942-NEXT: global_load_dwordx4 a[0:3], v1, s[0:1] offset:128 +; GFX942-NEXT: global_load_dwordx4 v[60:63], v1, s[0:1] offset:112 +; GFX942-NEXT: global_load_dwordx4 v[56:59], v1, s[0:1] offset:96 +; GFX942-NEXT: global_load_dwordx4 v[52:55], v1, s[0:1] offset:80 +; GFX942-NEXT: global_load_dwordx4 v[48:51], v1, s[0:1] offset:64 +; GFX942-NEXT: global_load_dwordx4 v[44:47], v1, s[0:1] offset:48 +; GFX942-NEXT: global_load_dwordx4 v[40:43], v1, s[0:1] offset:32 +; GFX942-NEXT: global_load_dwordx4 v[36:39], v1, s[0:1] offset:16 +; GFX942-NEXT: global_load_dwordx4 v[32:35], v1, s[0:1] +; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v2 +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX942-NEXT: s_cbranch_execz .LBB6_2 +; GFX942-NEXT: ; %bb.1: ; %bb.1 +; GFX942-NEXT: global_load_dwordx4 v[28:31], v1, s[2:3] offset:240 +; GFX942-NEXT: global_load_dwordx4 v[24:27], v1, s[2:3] offset:224 +; GFX942-NEXT: global_load_dwordx4 v[20:23], v1, s[2:3] offset:208 +; GFX942-NEXT: global_load_dwordx4 v[16:19], v1, s[2:3] offset:192 +; GFX942-NEXT: global_load_dwordx4 v[12:15], v1, s[2:3] offset:176 +; GFX942-NEXT: global_load_dwordx4 v[8:11], v1, s[2:3] offset:160 +; GFX942-NEXT: global_load_dwordx4 v[4:7], v1, s[2:3] offset:144 +; GFX942-NEXT: global_load_dwordx4 a[0:3], v1, s[2:3] offset:128 +; GFX942-NEXT: global_load_dwordx4 v[60:63], v1, s[2:3] offset:112 +; GFX942-NEXT: global_load_dwordx4 v[56:59], v1, s[2:3] offset:96 +; GFX942-NEXT: 
global_load_dwordx4 v[52:55], v1, s[2:3] offset:80 +; GFX942-NEXT: global_load_dwordx4 v[48:51], v1, s[2:3] offset:64 +; GFX942-NEXT: global_load_dwordx4 v[44:47], v1, s[2:3] offset:48 +; GFX942-NEXT: global_load_dwordx4 v[40:43], v1, s[2:3] offset:32 +; GFX942-NEXT: global_load_dwordx4 v[36:39], v1, s[2:3] offset:16 +; GFX942-NEXT: global_load_dwordx4 v[32:35], v1, s[2:3] +; GFX942-NEXT: .LBB6_2: ; %bb.2 +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(7) +; GFX942-NEXT: global_store_dwordx4 v0, v[60:63], s[6:7] offset:112 +; GFX942-NEXT: s_waitcnt vmcnt(7) +; GFX942-NEXT: global_store_dwordx4 v0, v[56:59], s[6:7] offset:96 +; GFX942-NEXT: s_waitcnt vmcnt(7) +; GFX942-NEXT: global_store_dwordx4 v0, v[52:55], s[6:7] offset:80 +; GFX942-NEXT: s_waitcnt vmcnt(7) +; GFX942-NEXT: global_store_dwordx4 v0, v[48:51], s[6:7] offset:64 +; GFX942-NEXT: s_waitcnt vmcnt(7) +; GFX942-NEXT: global_store_dwordx4 v0, v[44:47], s[6:7] offset:48 +; GFX942-NEXT: s_waitcnt vmcnt(7) +; GFX942-NEXT: global_store_dwordx4 v0, v[40:43], s[6:7] offset:32 +; GFX942-NEXT: s_waitcnt vmcnt(7) +; GFX942-NEXT: global_store_dwordx4 v0, v[36:39], s[6:7] offset:16 +; GFX942-NEXT: s_waitcnt vmcnt(7) +; GFX942-NEXT: global_store_dwordx4 v0, v[32:35], s[6:7] +; GFX942-NEXT: global_store_dwordx4 v0, v[28:31], s[6:7] offset:240 +; GFX942-NEXT: global_store_dwordx4 v0, v[24:27], s[6:7] offset:224 +; GFX942-NEXT: global_store_dwordx4 v0, v[20:23], s[6:7] offset:208 +; GFX942-NEXT: global_store_dwordx4 v0, v[16:19], s[6:7] offset:192 +; GFX942-NEXT: global_store_dwordx4 v0, v[12:15], s[6:7] offset:176 +; GFX942-NEXT: global_store_dwordx4 v0, v[8:11], s[6:7] offset:160 +; GFX942-NEXT: global_store_dwordx4 v0, v[4:7], s[6:7] offset:144 +; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] offset:128 +; GFX942-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx @@ -351,33 +335,34 @@ bb.2: } define 
amdgpu_kernel void @repeat_successor(i32 %in, ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { -; GFX906-LABEL: repeat_successor: -; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dword s8, s[4:5], 0x24 -; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX906-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c -; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: s_cmp_lt_i32 s8, 3 -; GFX906-NEXT: s_cbranch_scc0 .LBB7_3 -; GFX906-NEXT: ; %bb.1: ; %LeafBlock -; GFX906-NEXT: s_cmp_gt_i32 s8, 0 -; GFX906-NEXT: s_cbranch_scc0 .LBB7_6 -; GFX906-NEXT: ; %bb.2: -; GFX906-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX906-NEXT: global_load_dword v0, v0, s[0:1] -; GFX906-NEXT: s_branch .LBB7_5 -; GFX906-NEXT: .LBB7_3: ; %LeafBlock5 -; GFX906-NEXT: s_cmp_eq_u32 s8, 3 -; GFX906-NEXT: s_cbranch_scc0 .LBB7_6 -; GFX906-NEXT: ; %bb.4: ; %sw.bb5 -; GFX906-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX906-NEXT: global_load_dword v0, v0, s[2:3] -; GFX906-NEXT: .LBB7_5: ; %return.sink.split -; GFX906-NEXT: v_mov_b32_e32 v1, 0 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: global_store_dword v1, v0, s[6:7] -; GFX906-NEXT: .LBB7_6: ; %return -; GFX906-NEXT: s_endpgm +; GFX942-LABEL: repeat_successor: +; GFX942: ; %bb.0: ; %entry +; GFX942-NEXT: s_load_dword s8, s[4:5], 0x24 +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_cmp_lt_i32 s8, 3 +; GFX942-NEXT: s_cbranch_scc0 .LBB7_3 +; GFX942-NEXT: ; %bb.1: ; %LeafBlock +; GFX942-NEXT: s_cmp_gt_i32 s8, 0 +; GFX942-NEXT: s_cbranch_scc0 .LBB7_6 +; GFX942-NEXT: ; %bb.2: +; GFX942-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX942-NEXT: global_load_dword v0, v0, s[0:1] +; GFX942-NEXT: s_branch .LBB7_5 +; GFX942-NEXT: .LBB7_3: ; %LeafBlock5 +; GFX942-NEXT: s_cmp_eq_u32 s8, 3 +; GFX942-NEXT: s_cbranch_scc0 .LBB7_6 +; GFX942-NEXT: ; %bb.4: ; %sw.bb5 +; GFX942-NEXT: v_lshlrev_b32_e32 
v0, 2, v0 +; GFX942-NEXT: global_load_dword v0, v0, s[2:3] +; GFX942-NEXT: .LBB7_5: ; %return.sink.split +; GFX942-NEXT: v_mov_b32_e32 v1, 0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: global_store_dword v1, v0, s[6:7] +; GFX942-NEXT: .LBB7_6: ; %return +; GFX942-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx @@ -403,36 +388,37 @@ return: } define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) { -; GFX906-LABEL: v8i8_phi_chain: -; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX906-NEXT: v_lshlrev_b32_e32 v3, 3, v0 -; GFX906-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v0 -; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 -; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[8:9] -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX906-NEXT: s_cbranch_execz .LBB8_2 -; GFX906-NEXT: ; %bb.1: ; %bb.1 -; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[10:11] -; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0 -; GFX906-NEXT: s_andn2_b64 s[0:1], s[0:1], exec -; GFX906-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX906-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] -; GFX906-NEXT: .LBB8_2: ; %Flow -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] -; GFX906-NEXT: s_cbranch_execz .LBB8_4 -; GFX906-NEXT: ; %bb.3: ; %bb.2 -; GFX906-NEXT: v_mov_b32_e32 v0, 0 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[12:13] -; GFX906-NEXT: .LBB8_4: ; %bb.3 -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX906-NEXT: v_mov_b32_e32 v0, 0 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[14:15] -; GFX906-NEXT: s_endpgm +; GFX942-LABEL: v8i8_phi_chain: +; GFX942: ; %bb.0: ; %entry +; GFX942-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX942-NEXT: 
v_and_b32_e32 v2, 0x3ff, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v2 +; GFX942-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v2 +; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v2 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[8:9] +; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX942-NEXT: s_cbranch_execz .LBB8_2 +; GFX942-NEXT: ; %bb.1: ; %bb.1 +; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[10:11] +; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v2 +; GFX942-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; GFX942-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX942-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] +; GFX942-NEXT: .LBB8_2: ; %Flow +; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] +; GFX942-NEXT: s_cbranch_execz .LBB8_4 +; GFX942-NEXT: ; %bb.3: ; %bb.2 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[12:13] +; GFX942-NEXT: .LBB8_4: ; %bb.3 +; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[14:15] +; GFX942-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx @@ -458,42 +444,42 @@ bb.3: define amdgpu_kernel void @v8i8_phi_zeroinit(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) { -; GFX906-LABEL: v8i8_phi_zeroinit: -; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX906-NEXT: v_lshlrev_b32_e32 v5, 3, v0 -; GFX906-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v0 -; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 -; GFX906-NEXT: ; implicit-def: $vgpr3_vgpr4 -; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dwordx2 v[1:2], v5, s[8:9] -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX906-NEXT: s_cbranch_execz .LBB9_2 -; GFX906-NEXT: ; %bb.1: ; %bb.1 
-; GFX906-NEXT: global_load_dwordx2 v[3:4], v5, s[10:11] -; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0 -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_mov_b32_e32 v1, 0 -; GFX906-NEXT: s_andn2_b64 s[0:1], s[0:1], exec -; GFX906-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX906-NEXT: v_mov_b32_e32 v2, v1 -; GFX906-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] -; GFX906-NEXT: .LBB9_2: ; %Flow -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] -; GFX906-NEXT: s_cbranch_execz .LBB9_4 -; GFX906-NEXT: ; %bb.3: ; %bb.2 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_mov_b32_e32 v4, v2 -; GFX906-NEXT: v_mov_b32_e32 v0, 0 -; GFX906-NEXT: v_mov_b32_e32 v3, v1 -; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[12:13] -; GFX906-NEXT: .LBB9_4: ; %bb.3 -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX906-NEXT: v_mov_b32_e32 v0, 0 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: global_store_dwordx2 v0, v[3:4], s[14:15] -; GFX906-NEXT: s_endpgm +; GFX942-LABEL: v8i8_phi_zeroinit: +; GFX942: ; %bb.0: ; %entry +; GFX942-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX942-NEXT: v_and_b32_e32 v4, 0x3ff, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX942-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v4 +; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v4 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: global_load_dwordx2 v[0:1], v5, s[8:9] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX942-NEXT: s_cbranch_execz .LBB9_2 +; GFX942-NEXT: ; %bb.1: ; %bb.1 +; GFX942-NEXT: global_load_dwordx2 v[2:3], v5, s[10:11] +; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v4 +; GFX942-NEXT: s_waitcnt vmcnt(1) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; GFX942-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] +; GFX942-NEXT: .LBB9_2: ; %Flow +; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-NEXT: 
s_and_saveexec_b64 s[2:3], s[0:1] +; GFX942-NEXT: s_cbranch_execz .LBB9_4 +; GFX942-NEXT: ; %bb.3: ; %bb.2 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[12:13] +; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX942-NEXT: .LBB9_4: ; %bb.3 +; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[14:15] +; GFX942-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx @@ -518,91 +504,92 @@ bb.3: } define amdgpu_kernel void @v8i8_phi_const(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) { -; GFX906-LABEL: v8i8_phi_const: -; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX906-NEXT: v_lshlrev_b32_e32 v4, 3, v0 -; GFX906-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v0 -; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 -; GFX906-NEXT: ; implicit-def: $vgpr3 -; GFX906-NEXT: ; implicit-def: $vgpr13 -; GFX906-NEXT: ; implicit-def: $vgpr11 -; GFX906-NEXT: ; implicit-def: $vgpr14 -; GFX906-NEXT: ; implicit-def: $vgpr15 -; GFX906-NEXT: ; implicit-def: $vgpr12 -; GFX906-NEXT: ; implicit-def: $vgpr16 -; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dwordx2 v[1:2], v4, s[8:9] -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshrrev_b32_e32 v5, 24, v2 -; GFX906-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX906-NEXT: v_lshrrev_b32_e32 v7, 8, v2 -; GFX906-NEXT: v_lshrrev_b32_e32 v8, 24, v1 -; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; GFX906-NEXT: v_lshrrev_b32_e32 v10, 8, v1 -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX906-NEXT: s_cbranch_execz .LBB10_2 -; GFX906-NEXT: ; %bb.1: ; %bb.1 -; GFX906-NEXT: global_load_dwordx2 v[3:4], v4, s[10:11] -; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0 -; GFX906-NEXT: s_andn2_b64 
s[0:1], s[0:1], exec -; GFX906-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX906-NEXT: v_mov_b32_e32 v5, 8 -; GFX906-NEXT: v_mov_b32_e32 v6, 7 -; GFX906-NEXT: v_mov_b32_e32 v7, 6 -; GFX906-NEXT: v_mov_b32_e32 v2, 5 -; GFX906-NEXT: v_mov_b32_e32 v8, 4 -; GFX906-NEXT: v_mov_b32_e32 v9, 3 -; GFX906-NEXT: v_mov_b32_e32 v10, 2 -; GFX906-NEXT: v_mov_b32_e32 v1, 1 -; GFX906-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshrrev_b32_e32 v16, 24, v4 -; GFX906-NEXT: v_lshrrev_b32_e32 v12, 16, v4 -; GFX906-NEXT: v_lshrrev_b32_e32 v15, 8, v4 -; GFX906-NEXT: v_lshrrev_b32_e32 v14, 24, v3 -; GFX906-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; GFX906-NEXT: v_lshrrev_b32_e32 v13, 8, v3 -; GFX906-NEXT: .LBB10_2: ; %Flow -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] -; GFX906-NEXT: s_cbranch_execz .LBB10_4 -; GFX906-NEXT: ; %bb.3: ; %bb.2 -; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v10 -; GFX906-NEXT: v_lshlrev_b16_e32 v4, 8, v8 -; GFX906-NEXT: v_or_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v4, v9, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: v_lshlrev_b16_e32 v4, 8, v7 -; GFX906-NEXT: v_lshlrev_b16_e32 v11, 8, v5 -; GFX906-NEXT: v_or_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v11, v6, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_mov_b32_e32 v0, 0 -; GFX906-NEXT: v_or_b32_sdwa v4, v4, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: global_store_dwordx2 v0, v[3:4], s[12:13] -; GFX906-NEXT: v_mov_b32_e32 v3, v1 -; GFX906-NEXT: v_mov_b32_e32 v13, v10 -; GFX906-NEXT: v_mov_b32_e32 v11, v9 -; GFX906-NEXT: v_mov_b32_e32 v14, v8 -; 
GFX906-NEXT: v_mov_b32_e32 v4, v2 -; GFX906-NEXT: v_mov_b32_e32 v15, v7 -; GFX906-NEXT: v_mov_b32_e32 v12, v6 -; GFX906-NEXT: v_mov_b32_e32 v16, v5 -; GFX906-NEXT: .LBB10_4: ; %bb.3 -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v13 -; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v14 -; GFX906-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v15 -; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v16 -; GFX906-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v3, v12, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_mov_b32_e32 v2, 0 -; GFX906-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: global_store_dwordx2 v2, v[0:1], s[14:15] -; GFX906-NEXT: s_endpgm +; GFX942-LABEL: v8i8_phi_const: +; GFX942: ; %bb.0: ; %entry +; GFX942-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX942-NEXT: v_and_b32_e32 v16, 0x3ff, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v16 +; GFX942-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v16 +; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v16 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[8:9] +; GFX942-NEXT: ; implicit-def: $vgpr2 +; GFX942-NEXT: ; implicit-def: $vgpr12 +; GFX942-NEXT: ; implicit-def: $vgpr10 +; GFX942-NEXT: ; implicit-def: $vgpr13 +; GFX942-NEXT: ; implicit-def: $vgpr14 +; GFX942-NEXT: ; implicit-def: $vgpr11 +; GFX942-NEXT: ; implicit-def: $vgpr15 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_lshrrev_b32_e32 v4, 24, v1 +; GFX942-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX942-NEXT: 
v_lshrrev_b32_e32 v6, 8, v1 +; GFX942-NEXT: v_lshrrev_b32_e32 v7, 24, v0 +; GFX942-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; GFX942-NEXT: v_lshrrev_b32_e32 v9, 8, v0 +; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX942-NEXT: s_cbranch_execz .LBB10_2 +; GFX942-NEXT: ; %bb.1: ; %bb.1 +; GFX942-NEXT: global_load_dwordx2 v[2:3], v3, s[10:11] +; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v16 +; GFX942-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; GFX942-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX942-NEXT: v_mov_b32_e32 v4, 8 +; GFX942-NEXT: v_mov_b32_e32 v5, 7 +; GFX942-NEXT: v_mov_b32_e32 v6, 6 +; GFX942-NEXT: v_mov_b32_e32 v1, 5 +; GFX942-NEXT: v_mov_b32_e32 v7, 4 +; GFX942-NEXT: v_mov_b32_e32 v8, 3 +; GFX942-NEXT: v_mov_b32_e32 v9, 2 +; GFX942-NEXT: v_mov_b32_e32 v0, 1 +; GFX942-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_lshrrev_b32_e32 v15, 24, v3 +; GFX942-NEXT: v_lshrrev_b32_e32 v11, 16, v3 +; GFX942-NEXT: v_lshrrev_b32_e32 v14, 8, v3 +; GFX942-NEXT: v_lshrrev_b32_e32 v13, 24, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GFX942-NEXT: v_lshrrev_b32_e32 v12, 8, v2 +; GFX942-NEXT: .LBB10_2: ; %Flow +; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] +; GFX942-NEXT: s_cbranch_execz .LBB10_4 +; GFX942-NEXT: ; %bb.3: ; %bb.2 +; GFX942-NEXT: v_lshlrev_b16_e32 v2, 8, v9 +; GFX942-NEXT: v_lshlrev_b16_e32 v3, 8, v7 +; GFX942-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX942-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX942-NEXT: v_lshlrev_b16_e32 v11, 8, v4 +; GFX942-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX942-NEXT: v_lshlrev_b16_e32 v3, 8, v6 +; GFX942-NEXT: v_or_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX942-NEXT: v_or_b32_sdwa v11, v5, v11 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_or_b32_sdwa v3, v3, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX942-NEXT: global_store_dwordx2 v10, v[2:3], s[12:13] +; GFX942-NEXT: v_mov_b32_e32 v2, v0 +; GFX942-NEXT: v_mov_b32_e32 v12, v9 +; GFX942-NEXT: v_mov_b32_e32 v10, v8 +; GFX942-NEXT: v_mov_b32_e32 v13, v7 +; GFX942-NEXT: v_mov_b32_e32 v3, v1 +; GFX942-NEXT: v_mov_b32_e32 v14, v6 +; GFX942-NEXT: v_mov_b32_e32 v11, v5 +; GFX942-NEXT: v_mov_b32_e32 v15, v4 +; GFX942-NEXT: .LBB10_4: ; %bb.3 +; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-NEXT: v_lshlrev_b16_e32 v0, 8, v12 +; GFX942-NEXT: v_lshlrev_b16_e32 v1, 8, v13 +; GFX942-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX942-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX942-NEXT: v_lshlrev_b16_e32 v2, 8, v15 +; GFX942-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX942-NEXT: v_lshlrev_b16_e32 v1, 8, v14 +; GFX942-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX942-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[14:15] +; GFX942-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx @@ -627,34 +614,34 @@ bb.3: } define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) { -; GFX906-LABEL: v8i8_multi_block: -; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx8 
s[8:15], s[4:5], 0x24 -; GFX906-NEXT: v_lshlrev_b32_e32 v6, 3, v0 -; GFX906-NEXT: v_mov_b32_e32 v5, 0 -; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 -; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dwordx2 v[3:4], v6, s[8:9] -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_mov_b32_e32 v1, v3 -; GFX906-NEXT: v_mov_b32_e32 v2, v4 -; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX906-NEXT: s_cbranch_execz .LBB11_4 -; GFX906-NEXT: ; %bb.1: ; %bb.1 -; GFX906-NEXT: global_load_dwordx2 v[1:2], v6, s[10:11] -; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0 -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX906-NEXT: s_cbranch_execz .LBB11_3 -; GFX906-NEXT: ; %bb.2: ; %bb.2 -; GFX906-NEXT: v_mov_b32_e32 v0, 0 -; GFX906-NEXT: global_store_dwordx2 v0, v[3:4], s[12:13] -; GFX906-NEXT: .LBB11_3: ; %Flow -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX906-NEXT: .LBB11_4: ; %bb.3 -; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: global_store_dwordx2 v5, v[1:2], s[14:15] -; GFX906-NEXT: s_endpgm +; GFX942-LABEL: v8i8_multi_block: +; GFX942: ; %bb.0: ; %entry +; GFX942-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX942-NEXT: v_and_b32_e32 v5, 0x3ff, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v6, 3, v5 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v5 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: global_load_dwordx2 v[2:3], v6, s[8:9] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[2:3] +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX942-NEXT: s_cbranch_execz .LBB11_4 +; GFX942-NEXT: ; %bb.1: ; %bb.1 +; GFX942-NEXT: global_load_dwordx2 v[0:1], v6, s[10:11] +; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v5 +; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX942-NEXT: s_cbranch_execz .LBB11_3 +; GFX942-NEXT: ; %bb.2: ; %bb.2 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: global_store_dwordx2 v5, v[2:3], s[12:13] +; GFX942-NEXT: .LBB11_3: ; %Flow +; 
GFX942-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-NEXT: .LBB11_4: ; %bb.3 +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[14:15] +; GFX942-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx @@ -678,31 +665,32 @@ bb.3: } define amdgpu_kernel void @v32i8_loop_carried(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { -; GFX906-LABEL: v32i8_loop_carried: -; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX906-NEXT: v_lshlrev_b32_e32 v1, 5, v0 -; GFX906-NEXT: v_cmp_lt_u32_e32 vcc, 14, v0 -; GFX906-NEXT: s_mov_b32 s2, 0x2000604 -; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dword v1, v1, s[0:1] -; GFX906-NEXT: s_mov_b64 s[0:1], 0 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_mov_b32_e32 v0, v1 -; GFX906-NEXT: .LBB12_1: ; %bb.1 -; GFX906-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX906-NEXT: s_and_b64 s[6:7], exec, vcc -; GFX906-NEXT: s_or_b64 s[0:1], s[6:7], s[0:1] -; GFX906-NEXT: v_perm_b32 v0, v1, v0, s2 -; GFX906-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX906-NEXT: s_cbranch_execnz .LBB12_1 -; GFX906-NEXT: ; %bb.2: ; %bb.2.loopexit -; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 -; GFX906-NEXT: v_mov_b32_e32 v1, 0 -; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_store_dword v1, v0, s[0:1] -; GFX906-NEXT: s_endpgm +; GFX942-LABEL: v32i8_loop_carried: +; GFX942: ; %bb.0: ; %entry +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v0, 5, v1 +; GFX942-NEXT: v_cmp_lt_u32_e32 vcc, 14, v1 +; GFX942-NEXT: s_mov_b32 s2, 0x2000604 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: global_load_dword v0, v0, s[0:1] +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; 
GFX942-NEXT: v_mov_b32_e32 v1, v0 +; GFX942-NEXT: .LBB12_1: ; %bb.1 +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_and_b64 s[6:7], exec, vcc +; GFX942-NEXT: s_or_b64 s[0:1], s[6:7], s[0:1] +; GFX942-NEXT: v_perm_b32 v1, v0, v1, s2 +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB12_1 +; GFX942-NEXT: ; %bb.2: ; %bb.2.loopexit +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: global_store_dword v0, v1, s[0:1] +; GFX942-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() %gep1 = getelementptr <32 x i8>, ptr addrspace(1) %src1, i32 %idx @@ -724,91 +712,92 @@ bb.2: ; Should not have instances of "Instruction does not dominate all uses!" define amdgpu_kernel void @v8i8_multiuse_multiblock(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst1, ptr addrspace(1) nocapture %dst2, ptr addrspace(1) nocapture %dst3) { -; GFX906-LABEL: v8i8_multiuse_multiblock: -; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX906-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x44 -; GFX906-NEXT: v_cmp_lt_u32_e64 s[2:3], 14, v0 -; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 -; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dwordx2 v[1:2], v1, s[8:9] -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX906-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX906-NEXT: s_cbranch_execz .LBB13_2 -; GFX906-NEXT: ; %bb.1: ; %bb.1 -; GFX906-NEXT: s_movk_i32 s6, 0xff00 -; GFX906-NEXT: v_mov_b32_e32 v5, 8 -; GFX906-NEXT: v_and_b32_sdwa v6, v1, s6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX906-NEXT: s_mov_b32 s6, 0x6070504 -; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0 -; GFX906-NEXT: v_and_b32_e32 v4, 0xffffff00, v1 -; GFX906-NEXT: 
v_lshlrev_b16_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX906-NEXT: v_perm_b32 v7, v1, v1, s6 -; GFX906-NEXT: s_andn2_b64 s[2:3], s[2:3], exec -; GFX906-NEXT: s_and_b64 s[6:7], vcc, exec -; GFX906-NEXT: v_mov_b32_e32 v3, 0 -; GFX906-NEXT: v_or_b32_sdwa v4, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v6, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX906-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7] -; GFX906-NEXT: v_or_b32_sdwa v6, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: global_store_dword v3, v1, s[12:13] -; GFX906-NEXT: global_store_dword v3, v7, s[12:13] offset:8 -; GFX906-NEXT: global_store_dword v3, v6, s[12:13] offset:16 -; GFX906-NEXT: global_store_dword v3, v4, s[12:13] offset:24 -; GFX906-NEXT: .LBB13_2: ; %Flow -; GFX906-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX906-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] -; GFX906-NEXT: s_cbranch_execz .LBB13_4 -; GFX906-NEXT: ; %bb.3: ; %bb.2 -; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v2 -; GFX906-NEXT: v_and_b32_e32 v4, 0xffffff00, v2 -; GFX906-NEXT: v_and_b32_e32 v5, 0xffffff00, v1 -; GFX906-NEXT: s_mov_b32 s2, 0xc0c0001 -; GFX906-NEXT: v_or_b32_sdwa v3, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v5, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_perm_b32 v2, 0, v2, s2 -; GFX906-NEXT: v_mov_b32_e32 v0, 0 -; GFX906-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; 
GFX906-NEXT: v_perm_b32 v6, 0, v1, s2 -; GFX906-NEXT: s_mov_b32 s3, 0xffff0000 -; GFX906-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX906-NEXT: v_and_or_b32 v7, v1, s3, v6 -; GFX906-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX906-NEXT: global_store_dword v0, v3, s[14:15] -; GFX906-NEXT: global_store_dword v0, v4, s[14:15] offset:8 -; GFX906-NEXT: global_store_dword v0, v7, s[14:15] offset:16 -; GFX906-NEXT: global_store_dword v0, v2, s[14:15] offset:24 -; GFX906-NEXT: .LBB13_4: ; %bb.3 -; GFX906-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX906-NEXT: s_movk_i32 s3, 0xff00 -; GFX906-NEXT: v_mov_b32_e32 v4, 8 -; GFX906-NEXT: s_movk_i32 s2, 0xff -; GFX906-NEXT: v_and_b32_sdwa v2, v1, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX906-NEXT: v_lshlrev_b16_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX906-NEXT: v_or_b32_sdwa v3, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v5, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX906-NEXT: v_lshlrev_b16_e32 v6, 8, v1 -; GFX906-NEXT: v_and_b32_sdwa v7, v1, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX906-NEXT: v_mov_b32_e32 v0, 0 -; GFX906-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v7, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v4, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v2, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX906-NEXT: global_store_dword v0, v3, s[0:1] -; GFX906-NEXT: global_store_dword v0, v1, s[0:1] offset:8 -; GFX906-NEXT: global_store_dword v0, v4, s[0:1] offset:16 -; GFX906-NEXT: global_store_dword v0, v2, s[0:1] offset:24 -; GFX906-NEXT: s_endpgm +; GFX942-LABEL: v8i8_multiuse_multiblock: +; GFX942: ; %bb.0: ; %entry +; GFX942-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX942-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v0, 3, v2 +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x44 +; GFX942-NEXT: v_cmp_lt_u32_e64 s[2:3], 14, v2 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: global_load_dwordx2 v[0:1], v0, s[8:9] +; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v2 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX942-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX942-NEXT: s_cbranch_execz .LBB13_2 +; GFX942-NEXT: ; %bb.1: ; %bb.1 +; GFX942-NEXT: s_movk_i32 s6, 0xff00 +; GFX942-NEXT: v_mov_b32_e32 v5, 8 +; GFX942-NEXT: v_and_b32_sdwa v6, v0, s6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX942-NEXT: s_mov_b32 s6, 0x6070504 +; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v2 +; GFX942-NEXT: v_and_b32_e32 v4, 0xffffff00, v0 +; GFX942-NEXT: v_lshlrev_b16_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_perm_b32 v7, v0, v0, s6 +; GFX942-NEXT: s_andn2_b64 s[2:3], s[2:3], exec +; GFX942-NEXT: s_and_b64 s[6:7], vcc, exec +; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_or_b32_sdwa v4, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX942-NEXT: v_or_b32_sdwa v5, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX942-NEXT: v_or_b32_sdwa 
v6, v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX942-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7] +; GFX942-NEXT: v_or_b32_sdwa v6, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX942-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX942-NEXT: global_store_dword v3, v0, s[12:13] +; GFX942-NEXT: global_store_dword v3, v7, s[12:13] offset:8 +; GFX942-NEXT: global_store_dword v3, v6, s[12:13] offset:16 +; GFX942-NEXT: global_store_dword v3, v4, s[12:13] offset:24 +; GFX942-NEXT: .LBB13_2: ; %Flow +; GFX942-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX942-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] +; GFX942-NEXT: s_cbranch_execz .LBB13_4 +; GFX942-NEXT: ; %bb.3: ; %bb.2 +; GFX942-NEXT: v_lshlrev_b16_e32 v3, 8, v1 +; GFX942-NEXT: v_and_b32_e32 v4, 0xffffff00, v1 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffffff00, v0 +; GFX942-NEXT: s_mov_b32 s2, 0xc0c0001 +; GFX942-NEXT: v_or_b32_sdwa v3, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX942-NEXT: v_or_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX942-NEXT: v_or_b32_sdwa v5, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX942-NEXT: v_perm_b32 v1, 0, v1, s2 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX942-NEXT: v_perm_b32 v6, 0, v0, s2 +; GFX942-NEXT: s_mov_b32 s3, 0xffff0000 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX942-NEXT: v_and_or_b32 v7, v0, s3, v6 +; GFX942-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX942-NEXT: v_or_b32_e32 v1, v6, v1 +; GFX942-NEXT: global_store_dword v2, v3, s[14:15] +; GFX942-NEXT: global_store_dword v2, v4, s[14:15] offset:8 +; GFX942-NEXT: global_store_dword v2, v7, s[14:15] offset:16 +; GFX942-NEXT: 
global_store_dword v2, v1, s[14:15] offset:24 +; GFX942-NEXT: .LBB13_4: ; %bb.3 +; GFX942-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX942-NEXT: s_movk_i32 s3, 0xff00 +; GFX942-NEXT: v_mov_b32_e32 v4, 8 +; GFX942-NEXT: s_movk_i32 s2, 0xff +; GFX942-NEXT: v_and_b32_sdwa v2, v0, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX942-NEXT: v_lshlrev_b16_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_or_b32_sdwa v3, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX942-NEXT: v_or_b32_sdwa v5, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX942-NEXT: v_lshlrev_b16_e32 v6, 8, v0 +; GFX942-NEXT: v_and_b32_sdwa v7, v0, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX942-NEXT: v_mov_b32_e32 v1, 0 +; GFX942-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX942-NEXT: v_or_b32_sdwa v7, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX942-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX942-NEXT: v_or_b32_sdwa v4, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX942-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX942-NEXT: v_or_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX942-NEXT: v_or_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX942-NEXT: v_or_b32_sdwa v2, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX942-NEXT: global_store_dword v1, v3, s[0:1] +; GFX942-NEXT: global_store_dword v1, v0, s[0:1] offset:8 +; GFX942-NEXT: global_store_dword v1, v4, s[0:1] offset:16 +; GFX942-NEXT: global_store_dword v1, v2, s[0:1] offset:24 +; GFX942-NEXT: s_endpgm entry: %idx = 
call i32 @llvm.amdgcn.workitem.id.x() %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx @@ -865,4 +854,181 @@ bb.3: } +define amdgpu_kernel void @v8i8_mfma_i8(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst, ptr addrspace(1) %arg) { +; GFX942-LABEL: v8i8_mfma_i8: +; GFX942: ; %bb.0: ; %entry +; GFX942-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX942-NEXT: v_and_b32_e32 v4, 0x3ff, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v4 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v4 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[8:9] +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX942-NEXT: s_cbranch_execz .LBB14_2 +; GFX942-NEXT: ; %bb.1: ; %bb.1 +; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[10:11] +; GFX942-NEXT: .LBB14_2: ; %bb.2 +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[14:15], 0x0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX942-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX942-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[0:1], v[0:1], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-NEXT: s_nop 6 +; GFX942-NEXT: global_store_dwordx4 v2, a[0:3], s[12:13] +; GFX942-NEXT: s_endpgm +entry: + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx + %vec1 = load <8 x i8>, ptr addrspace(1) %gep1 + %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx + %vec2 = load <8 x i8>, ptr addrspace(1) %gep2 + %cmp = icmp ult i32 %idx, 15 + br i1 %cmp, label %bb.1, label %bb.2 +bb.1: + br label %bb.2 + +bb.2: + %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ] + %mfmaop = bitcast <8 x i8> %tmp5 to i64 + %in.1 = load <4 x i32>, ptr addrspace(1) %arg + %mai.1 = tail call <4 x i32> 
@llvm.amdgcn.mfma.i32.16x16x32.i8(i64 %mfmaop, i64 %mfmaop, <4 x i32> %in.1, i32 1, i32 2, i32 3) + store <4 x i32> %mai.1, ptr addrspace(1) %dst, align 4 + ret void +} + +; Demonstrates that even if the intrinsic is not an 8-bit intrinsic, type coercion is still applied. + +define amdgpu_kernel void @v8i8_mfma_half(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst, ptr addrspace(1) %arg) { +; GFX942-LABEL: v8i8_mfma_half: +; GFX942: ; %bb.0: ; %entry +; GFX942-NEXT: s_load_dwordx8 s[36:43], s[4:5], 0x24 +; GFX942-NEXT: v_and_b32_e32 v4, 0x3ff, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v4 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v4 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[36:37] +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX942-NEXT: s_cbranch_execz .LBB15_2 +; GFX942-NEXT: ; %bb.1: ; %bb.1 +; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[38:39] +; GFX942-NEXT: .LBB15_2: ; %bb.2 +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_load_dwordx16 s[16:31], s[42:43], 0x0 +; GFX942-NEXT: s_load_dwordx16 s[0:15], s[42:43], 0x40 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_accvgpr_write_b32 a0, s16 +; GFX942-NEXT: v_accvgpr_write_b32 a1, s17 +; GFX942-NEXT: v_accvgpr_write_b32 a2, s18 +; GFX942-NEXT: v_accvgpr_write_b32 a3, s19 +; GFX942-NEXT: v_accvgpr_write_b32 a4, s20 +; GFX942-NEXT: v_accvgpr_write_b32 a5, s21 +; GFX942-NEXT: v_accvgpr_write_b32 a6, s22 +; GFX942-NEXT: v_accvgpr_write_b32 a7, s23 +; GFX942-NEXT: v_accvgpr_write_b32 a8, s24 +; GFX942-NEXT: v_accvgpr_write_b32 a9, s25 +; GFX942-NEXT: v_accvgpr_write_b32 a10, s26 +; GFX942-NEXT: v_accvgpr_write_b32 a11, s27 +; GFX942-NEXT: v_accvgpr_write_b32 a12, s28 +; GFX942-NEXT: v_accvgpr_write_b32 a13, s29 +; GFX942-NEXT: v_accvgpr_write_b32 a14, s30 +; GFX942-NEXT: v_accvgpr_write_b32 a15, s31 +; GFX942-NEXT: v_accvgpr_write_b32 a16, s0 +; GFX942-NEXT: v_accvgpr_write_b32
a17, s1 +; GFX942-NEXT: v_accvgpr_write_b32 a18, s2 +; GFX942-NEXT: v_accvgpr_write_b32 a19, s3 +; GFX942-NEXT: v_accvgpr_write_b32 a20, s4 +; GFX942-NEXT: v_accvgpr_write_b32 a21, s5 +; GFX942-NEXT: v_accvgpr_write_b32 a22, s6 +; GFX942-NEXT: v_accvgpr_write_b32 a23, s7 +; GFX942-NEXT: v_accvgpr_write_b32 a24, s8 +; GFX942-NEXT: v_accvgpr_write_b32 a25, s9 +; GFX942-NEXT: v_accvgpr_write_b32 a26, s10 +; GFX942-NEXT: v_accvgpr_write_b32 a27, s11 +; GFX942-NEXT: v_accvgpr_write_b32 a28, s12 +; GFX942-NEXT: v_accvgpr_write_b32 a29, s13 +; GFX942-NEXT: v_accvgpr_write_b32 a30, s14 +; GFX942-NEXT: v_accvgpr_write_b32 a31, s15 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_mfma_f32_32x32x4_2b_f16 a[0:31], v[0:1], v[0:1], a[0:31] cbsz:1 abid:2 blgp:3 +; GFX942-NEXT: s_nop 7 +; GFX942-NEXT: s_nop 7 +; GFX942-NEXT: s_nop 2 +; GFX942-NEXT: global_store_dwordx4 v2, a[28:31], s[40:41] offset:112 +; GFX942-NEXT: global_store_dwordx4 v2, a[24:27], s[40:41] offset:96 +; GFX942-NEXT: global_store_dwordx4 v2, a[20:23], s[40:41] offset:80 +; GFX942-NEXT: global_store_dwordx4 v2, a[16:19], s[40:41] offset:64 +; GFX942-NEXT: global_store_dwordx4 v2, a[12:15], s[40:41] offset:48 +; GFX942-NEXT: global_store_dwordx4 v2, a[8:11], s[40:41] offset:32 +; GFX942-NEXT: global_store_dwordx4 v2, a[4:7], s[40:41] offset:16 +; GFX942-NEXT: global_store_dwordx4 v2, a[0:3], s[40:41] +; GFX942-NEXT: s_endpgm +entry: + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx + %vec1 = load <8 x i8>, ptr addrspace(1) %gep1 + %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx + %vec2 = load <8 x i8>, ptr addrspace(1) %gep2 + %cmp = icmp ult i32 %idx, 15 + br i1 %cmp, label %bb.1, label %bb.2 +bb.1: + br label %bb.2 + +bb.2: + %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ] + %mfmaop = bitcast <8 x i8> %tmp5 to <4 x half> + %in.1 = load <32 x float>, ptr addrspace(1) %arg + %mai.1 = tail call <32 
x float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half> %mfmaop, <4 x half> %mfmaop, <32 x float> %in.1, i32 1, i32 2, i32 3) + store <32 x float> %mai.1, ptr addrspace(1) %dst, align 4 + ret void +} + + +define amdgpu_kernel void @v8i8_intrinsic(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { +; GFX942-LABEL: v8i8_intrinsic: +; GFX942: ; %bb.0: ; %entry +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX942-NEXT: v_and_b32_e32 v4, 0x3ff, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v4 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[0:1] +; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v4 +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX942-NEXT: s_cbranch_execz .LBB16_2 +; GFX942-NEXT: ; %bb.1: ; %bb.1 +; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[2:3] +; GFX942-NEXT: .LBB16_2: ; %bb.2 +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[0:1], v[0:1] +; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX942-NEXT: s_endpgm +entry: + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx + %vec1 = load <8 x i8>, ptr addrspace(1) %gep1 + %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx + %vec2 = load <8 x i8>, ptr addrspace(1) %gep2 + %cmp = icmp ult i32 %idx, 15 + br i1 %cmp, label %bb.1, label %bb.2 +bb.1: + br label %bb.2 + +bb.2: + %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ] + %op = bitcast <8 x i8> %tmp5 to <2 x float> + %result = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %op, <2 x float> %op, <2 x float> %op) + store <2 x float> %result, ptr addrspace(1) %dst, align 8 + ret void +} + declare i32 @llvm.amdgcn.workitem.id.x()