diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 54d94b1f8682e..84121138a40df 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1614,7 +1614,8 @@ def FeatureGFX12 : GCNSubtargetFeatureGeneration<"GFX12", FeatureDefaultComponentBroadcast, FeatureMaxHardClauseLength32, FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts, FeatureIEEEMinimumMaximumInsts, FeatureMinimum3Maximum3F32, - FeatureMinimum3Maximum3F16, FeatureAgentScopeFineGrainedRemoteMemoryAtomics + FeatureMinimum3Maximum3F16, FeatureAgentScopeFineGrainedRemoteMemoryAtomics, + FeatureD16Writes32BitVgpr ] >; diff --git a/llvm/test/CodeGen/AMDGPU/spillv16.ll b/llvm/test/CodeGen/AMDGPU/spillv16.ll index 2d54ac8283a3a..60c1490dcbe71 100644 --- a/llvm/test/CodeGen/AMDGPU/spillv16.ll +++ b/llvm/test/CodeGen/AMDGPU/spillv16.ll @@ -1,39 +1,61 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s -check-prefixes=GCN,GCN-TRUE16 -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s -check-prefixes=GCN,GCN-FAKE16 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s -check-prefixes=GFX11-TRUE16 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s -check-prefixes=GFX11-FAKE16 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+real-true16 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s -check-prefixes=GFX12-TRUE16,GFX12-TRUE16-D16W16 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+real-true16,+d16-write-vgpr32 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s -check-prefixes=GFX12-TRUE16,GFX12-TRUE16-D16W32 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+real-true16 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s -check-prefixes=GFX1250,GFX1250-TRUE16 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=-real-true16 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s -check-prefixes=GFX1250,GFX1250-FAKE16 define void @spill_i16_alu() { -; GCN-TRUE16-LABEL: spill_i16_alu: -; GCN-TRUE16: ; %bb.0: ; %entry -; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 glc dlc -; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GCN-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x7b, v0.l -; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 ; 2-byte Folded Spill -; GCN-TRUE16-NEXT: ;;#ASMSTART -; GCN-TRUE16-NEXT: ;;#ASMEND -; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:2 ; 2-byte Folded Reload -; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 dlc -; GCN-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GCN-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: spill_i16_alu: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x7b, v0.l +; GFX11-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 ; 2-byte Folded Spill +; GFX11-TRUE16-NEXT: ;;#ASMSTART +; GFX11-TRUE16-NEXT: ;;#ASMEND +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:2 ; 2-byte Folded Reload +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: scratch_store_b16 off, v0, s32 dlc +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; -; GCN-FAKE16-LABEL: spill_i16_alu: -; GCN-FAKE16: ; %bb.0: ; %entry -; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-FAKE16-NEXT: scratch_load_u16 v0, off, s32 glc dlc -; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GCN-FAKE16-NEXT: v_add_nc_u16 v0, 0x7b, v0 -; GCN-FAKE16-NEXT: scratch_store_b32 off, v0, s32 offset:4 ; 4-byte Folded Spill -; GCN-FAKE16-NEXT: ;;#ASMSTART -; GCN-FAKE16-NEXT: ;;#ASMEND -; GCN-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:4 ; 4-byte Folded Reload -; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GCN-FAKE16-NEXT: scratch_store_b16 off, v0, s32 dlc -; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-LABEL: spill_i16_alu: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, 0x7b, v0 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v0, s32 offset:4 ; 4-byte Folded Spill +; GFX11-FAKE16-NEXT: ;;#ASMSTART +; GFX11-FAKE16-NEXT: ;;#ASMEND +; GFX11-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:4 ; 4-byte Folded Reload +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: scratch_store_b16 off, v0, s32 dlc +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-TRUE16-LABEL: spill_i16_alu: +; GFX12-TRUE16: ; %bb.0: ; %entry +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x7b, v0.l +; GFX12-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 ; 2-byte Folded Spill +; GFX12-TRUE16-NEXT: ;;#ASMSTART +; GFX12-TRUE16-NEXT: ;;#ASMEND +; GFX12-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:2 th:TH_LOAD_LU ; 2-byte Folded Reload +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-TRUE16-LABEL: spill_i16_alu: ; GFX1250-TRUE16: ; %bb.0: ; %entry @@ -86,45 +108,95 @@ entry: } define void @spill_i16_alu_two_vals() { -; GCN-TRUE16-LABEL: spill_i16_alu_two_vals: -; GCN-TRUE16: ; %bb.0: ; %entry -; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 glc dlc -; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GCN-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x7b, v0.l -; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:6 ; 2-byte Folded Spill -; GCN-TRUE16-NEXT: ;;#ASMSTART -; GCN-TRUE16-NEXT: ;;#ASMEND -; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:4 glc dlc -; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GCN-TRUE16-NEXT: scratch_load_d16_hi_b16 v0, off, s32 offset:6 ; 2-byte Folded Reload -; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GCN-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x7b, v0.l -; GCN-TRUE16-NEXT: scratch_store_d16_hi_b16 off, v0, s32 dlc -; GCN-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:4 dlc -; GCN-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GCN-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: spill_i16_alu_two_vals: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x7b, v0.l +; GFX11-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:6 ; 2-byte Folded Spill +; GFX11-TRUE16-NEXT: ;;#ASMSTART +; GFX11-TRUE16-NEXT: ;;#ASMEND +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:4 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v0, off, s32 offset:6 ; 2-byte Folded Reload +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x7b, v0.l +; GFX11-TRUE16-NEXT: scratch_store_d16_hi_b16 off, v0, s32 dlc +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:4 dlc +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: spill_i16_alu_two_vals: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, 0x7b, v0 +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill +; GFX11-FAKE16-NEXT: ;;#ASMSTART +; GFX11-FAKE16-NEXT: ;;#ASMEND +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:4 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: scratch_load_b32 v1, off, s32 offset:8 ; 4-byte Folded Reload +; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, 0x7b, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: scratch_store_b16 off, v1, s32 dlc +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: scratch_store_b16 off, v0, s32 offset:4 dlc +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-TRUE16-D16W16-LABEL: spill_i16_alu_two_vals: +; GFX12-TRUE16-D16W16: ; %bb.0: ; %entry +; GFX12-TRUE16-D16W16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-D16W16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-D16W16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-D16W16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-D16W16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-D16W16-NEXT: scratch_load_d16_b16 v0, off, s32 scope:SCOPE_SYS +; GFX12-TRUE16-D16W16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-D16W16-NEXT: v_add_nc_u16 v0.l, 0x7b, v0.l +; GFX12-TRUE16-D16W16-NEXT: scratch_store_b16 off, v0, s32 offset:6 ; 2-byte Folded Spill +; GFX12-TRUE16-D16W16-NEXT: ;;#ASMSTART +; GFX12-TRUE16-D16W16-NEXT: ;;#ASMEND +; GFX12-TRUE16-D16W16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:4 scope:SCOPE_SYS +; GFX12-TRUE16-D16W16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-D16W16-NEXT: scratch_load_d16_hi_b16 v0, off, s32 offset:6 th:TH_LOAD_LU ; 2-byte Folded Reload +; GFX12-TRUE16-D16W16-NEXT: v_add_nc_u16 v0.l, 0x7b, v0.l +; GFX12-TRUE16-D16W16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-D16W16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-D16W16-NEXT: scratch_store_d16_hi_b16 off, v0, s32 scope:SCOPE_SYS +; GFX12-TRUE16-D16W16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-D16W16-NEXT: scratch_store_b16 off, v0, s32 offset:4 scope:SCOPE_SYS +; GFX12-TRUE16-D16W16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-D16W16-NEXT: s_setpc_b64 s[30:31] ; -; GCN-FAKE16-LABEL: spill_i16_alu_two_vals: -; GCN-FAKE16: ; %bb.0: ; %entry -; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-FAKE16-NEXT: scratch_load_u16 v0, off, s32 glc dlc -; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GCN-FAKE16-NEXT: v_add_nc_u16 v0, 0x7b, v0 -; GCN-FAKE16-NEXT: scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill -; GCN-FAKE16-NEXT: ;;#ASMSTART -; GCN-FAKE16-NEXT: ;;#ASMEND -; GCN-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:4 glc dlc -; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GCN-FAKE16-NEXT: scratch_load_b32 v1, off, s32 offset:8 ; 4-byte Folded Reload -; GCN-FAKE16-NEXT: v_add_nc_u16 v0, 0x7b, v0 -; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GCN-FAKE16-NEXT: scratch_store_b16 off, v1, s32 dlc -; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GCN-FAKE16-NEXT: scratch_store_b16 off, v0, s32 offset:4 dlc -; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-D16W32-LABEL: spill_i16_alu_two_vals: +; GFX12-TRUE16-D16W32: ; %bb.0: ; %entry +; GFX12-TRUE16-D16W32-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-D16W32-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-D16W32-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-D16W32-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-D16W32-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-D16W32-NEXT: scratch_load_d16_b16 v0, off, s32 scope:SCOPE_SYS +; GFX12-TRUE16-D16W32-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-D16W32-NEXT: v_add_nc_u16 v0.l, 0x7b, v0.l +; GFX12-TRUE16-D16W32-NEXT: scratch_store_b16 off, v0, s32 offset:6 ; 2-byte Folded Spill +; GFX12-TRUE16-D16W32-NEXT: ;;#ASMSTART +; GFX12-TRUE16-D16W32-NEXT: ;;#ASMEND +; GFX12-TRUE16-D16W32-NEXT: scratch_load_d16_b16 v0, off, s32 offset:4 scope:SCOPE_SYS +; GFX12-TRUE16-D16W32-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-D16W32-NEXT: scratch_load_d16_hi_b16 v0, off, s32 offset:6 th:TH_LOAD_LU ; 2-byte Folded Reload +; GFX12-TRUE16-D16W32-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-D16W32-NEXT: v_add_nc_u16 v0.l, 0x7b, v0.l +; GFX12-TRUE16-D16W32-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-D16W32-NEXT: scratch_store_d16_hi_b16 off, v0, s32 scope:SCOPE_SYS +; GFX12-TRUE16-D16W32-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-D16W32-NEXT: scratch_store_b16 off, v0, s32 offset:4 scope:SCOPE_SYS +; GFX12-TRUE16-D16W32-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-D16W32-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-TRUE16-LABEL: spill_i16_alu_two_vals: ; GFX1250-TRUE16: ; %bb.0: ; %entry @@ -195,33 +267,52 @@ entry: ; Tests after this do not actually test 16 bit spills because there is no use of VGPR_16. They could demonstrate 16 bit spills if we update the instructions to use VGPR_16 instead of VGPR_32 define void @spill_i16() { -; GCN-TRUE16-LABEL: spill_i16: -; GCN-TRUE16: ; %bb.0: ; %entry -; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 glc dlc -; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 ; 2-byte Folded Spill -; GCN-TRUE16-NEXT: ;;#ASMSTART -; GCN-TRUE16-NEXT: ;;#ASMEND -; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:2 ; 2-byte Folded Reload -; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 dlc -; GCN-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GCN-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: spill_i16: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 ; 2-byte Folded Spill +; GFX11-TRUE16-NEXT: ;;#ASMSTART +; GFX11-TRUE16-NEXT: ;;#ASMEND +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:2 ; 2-byte Folded Reload +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: scratch_store_b16 off, v0, s32 dlc +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; -; GCN-FAKE16-LABEL: spill_i16: -; GCN-FAKE16: ; %bb.0: ; %entry -; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-FAKE16-NEXT: scratch_load_u16 v0, off, s32 glc dlc -; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GCN-FAKE16-NEXT: scratch_store_b32 off, v0, s32 offset:4 ; 4-byte Folded Spill -; GCN-FAKE16-NEXT: ;;#ASMSTART -; GCN-FAKE16-NEXT: ;;#ASMEND -; GCN-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:4 ; 4-byte Folded Reload -; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GCN-FAKE16-NEXT: scratch_store_b16 off, v0, s32 dlc -; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-LABEL: spill_i16: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v0, s32 offset:4 ; 4-byte Folded Spill +; GFX11-FAKE16-NEXT: ;;#ASMSTART +; GFX11-FAKE16-NEXT: ;;#ASMEND +; GFX11-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:4 ; 4-byte Folded Reload +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: scratch_store_b16 off, v0, s32 dlc +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-TRUE16-LABEL: spill_i16: +; GFX12-TRUE16: ; %bb.0: ; %entry +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 ; 2-byte Folded Spill +; GFX12-TRUE16-NEXT: ;;#ASMSTART +; GFX12-TRUE16-NEXT: ;;#ASMEND +; GFX12-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:2 th:TH_LOAD_LU ; 2-byte Folded Reload +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-LABEL: spill_i16: ; GFX1250: ; %bb.0: ; %entry @@ -254,33 +345,52 @@ entry: } define void @spill_half() { -; GCN-TRUE16-LABEL: spill_half: -; GCN-TRUE16: ; %bb.0: ; %entry -; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 glc dlc -; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 ; 2-byte Folded Spill -; GCN-TRUE16-NEXT: ;;#ASMSTART -; GCN-TRUE16-NEXT: ;;#ASMEND -; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:2 ; 2-byte Folded Reload -; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 dlc -; GCN-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GCN-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: spill_half: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 ; 2-byte Folded Spill +; GFX11-TRUE16-NEXT: ;;#ASMSTART +; GFX11-TRUE16-NEXT: ;;#ASMEND +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:2 ; 2-byte Folded Reload +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: scratch_store_b16 off, v0, s32 dlc +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: spill_half: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v0, s32 offset:4 ; 4-byte Folded Spill +; GFX11-FAKE16-NEXT: ;;#ASMSTART +; GFX11-FAKE16-NEXT: ;;#ASMEND +; GFX11-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:4 ; 4-byte Folded Reload +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: scratch_store_b16 off, v0, s32 dlc +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; -; GCN-FAKE16-LABEL: spill_half: -; GCN-FAKE16: ; %bb.0: ; %entry -; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-FAKE16-NEXT: scratch_load_u16 v0, off, s32 glc dlc -; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GCN-FAKE16-NEXT: scratch_store_b32 off, v0, s32 offset:4 ; 4-byte Folded Spill -; GCN-FAKE16-NEXT: ;;#ASMSTART -; GCN-FAKE16-NEXT: ;;#ASMEND -; GCN-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:4 ; 4-byte Folded Reload -; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GCN-FAKE16-NEXT: scratch_store_b16 off, v0, s32 dlc -; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: spill_half: +; GFX12-TRUE16: ; %bb.0: ; %entry +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 ; 2-byte Folded Spill +; GFX12-TRUE16-NEXT: ;;#ASMSTART +; GFX12-TRUE16-NEXT: ;;#ASMEND +; GFX12-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:2 th:TH_LOAD_LU ; 2-byte Folded Reload +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-LABEL: spill_half: ; GFX1250: ; %bb.0: ; %entry @@ -313,33 +423,52 @@ entry: } define void @spill_i16_from_v2i16() { -; GCN-TRUE16-LABEL: spill_i16_from_v2i16: -; GCN-TRUE16: ; %bb.0: ; %entry -; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:2 glc dlc -; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:8 ; 2-byte Folded Spill -; GCN-TRUE16-NEXT: ;;#ASMSTART -; GCN-TRUE16-NEXT: ;;#ASMEND -; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:8 ; 2-byte Folded Reload -; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 dlc -; GCN-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GCN-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: spill_i16_from_v2i16: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:2 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:8 ; 2-byte Folded Spill +; GFX11-TRUE16-NEXT: ;;#ASMSTART +; GFX11-TRUE16-NEXT: ;;#ASMEND +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:8 ; 2-byte Folded Reload +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 dlc +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; -; GCN-FAKE16-LABEL: spill_i16_from_v2i16: -; GCN-FAKE16: ; %bb.0: ; %entry -; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:2 glc dlc -; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GCN-FAKE16-NEXT: scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill -; GCN-FAKE16-NEXT: ;;#ASMSTART -; GCN-FAKE16-NEXT: ;;#ASMEND -; GCN-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:8 ; 4-byte Folded Reload -; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GCN-FAKE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 dlc -; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-LABEL: spill_i16_from_v2i16: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:2 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill +; GFX11-FAKE16-NEXT: ;;#ASMSTART +; GFX11-FAKE16-NEXT: ;;#ASMEND +; GFX11-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:8 ; 4-byte Folded Reload +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 dlc +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-TRUE16-LABEL: spill_i16_from_v2i16: +; GFX12-TRUE16: ; %bb.0: ; %entry +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:2 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:8 ; 2-byte Folded Spill +; GFX12-TRUE16-NEXT: ;;#ASMSTART +; GFX12-TRUE16-NEXT: ;;#ASMEND +; GFX12-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:8 th:TH_LOAD_LU ; 2-byte Folded Reload +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-LABEL: spill_i16_from_v2i16: ; GFX1250: ; %bb.0: ; %entry @@ -372,47 +501,73 @@ entry: } define void @spill_2xi16_from_v2i16() { -; GCN-TRUE16-LABEL: spill_2xi16_from_v2i16: -; GCN-TRUE16: ; %bb.0: ; %entry -; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:2 glc dlc -; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:8 ; 2-byte Folded Spill -; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 glc dlc -; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:10 ; 2-byte Folded Spill -; GCN-TRUE16-NEXT: ;;#ASMSTART -; GCN-TRUE16-NEXT: ;;#ASMEND -; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:8 ; 2-byte Folded Reload -; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 dlc -; GCN-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:10 ; 2-byte Folded Reload -; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 dlc -; GCN-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GCN-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: spill_2xi16_from_v2i16: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:2 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:8 ; 2-byte Folded Spill +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:10 ; 2-byte Folded Spill +; GFX11-TRUE16-NEXT: ;;#ASMSTART +; GFX11-TRUE16-NEXT: ;;#ASMEND +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:8 ; 2-byte Folded Reload +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 dlc +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:10 ; 2-byte Folded Reload +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: scratch_store_b16 off, v0, s32 dlc +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: spill_2xi16_from_v2i16: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:2 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v0, s32 offset:12 ; 4-byte Folded Spill +; GFX11-FAKE16-NEXT: ;;#ASMSTART +; GFX11-FAKE16-NEXT: ;;#ASMEND +; GFX11-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:8 ; 4-byte Folded Reload +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 dlc +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:12 ; 4-byte Folded Reload +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: scratch_store_b16 off, v0, s32 dlc +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; -; GCN-FAKE16-LABEL: spill_2xi16_from_v2i16: -; GCN-FAKE16: ; %bb.0: ; %entry -; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:2 glc dlc -; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GCN-FAKE16-NEXT: scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill -; GCN-FAKE16-NEXT: scratch_load_u16 v0, off, s32 glc dlc -; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GCN-FAKE16-NEXT: scratch_store_b32 off, v0, s32 offset:12 ; 4-byte Folded Spill -; GCN-FAKE16-NEXT: ;;#ASMSTART -; GCN-FAKE16-NEXT: ;;#ASMEND -; GCN-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:8 ; 4-byte Folded Reload -; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GCN-FAKE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 dlc -; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GCN-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:12 ; 4-byte Folded Reload -; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GCN-FAKE16-NEXT: scratch_store_b16 off, v0, s32 dlc -; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: spill_2xi16_from_v2i16: +; GFX12-TRUE16: ; %bb.0: ; %entry +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:2 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:8 ; 2-byte Folded Spill +; GFX12-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:10 ; 2-byte Folded Spill +; GFX12-TRUE16-NEXT: ;;#ASMSTART +; GFX12-TRUE16-NEXT: ;;#ASMEND +; GFX12-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:8 th:TH_LOAD_LU ; 2-byte Folded Reload +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:10 th:TH_LOAD_LU ; 2-byte Folded Reload +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-TRUE16-LABEL: spill_2xi16_from_v2i16: ; GFX1250-TRUE16: ; %bb.0: ; %entry @@ -481,44 +636,70 @@ entry: } define void @spill_2xi16_from_v2i16_one_free_reg() { -; GCN-TRUE16-LABEL: spill_2xi16_from_v2i16_one_free_reg: -; GCN-TRUE16: ; %bb.0: ; %entry -; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:2 glc dlc -; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:8 ; 2-byte Folded Spill -; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 glc dlc -; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:10 ; 2-byte Folded Spill -; GCN-TRUE16-NEXT: ;;#ASMSTART -; GCN-TRUE16-NEXT: ;;#ASMEND -; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:8 ; 2-byte Folded Reload -; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 dlc -; GCN-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:10 ; 2-byte Folded Reload -; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 dlc -; GCN-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GCN-TRUE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: spill_2xi16_from_v2i16_one_free_reg: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:2 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:8 ; 2-byte Folded Spill +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:10 ; 2-byte Folded Spill +; GFX11-TRUE16-NEXT: ;;#ASMSTART +; GFX11-TRUE16-NEXT: ;;#ASMEND +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:8 ; 2-byte Folded Reload +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 dlc +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:10 ; 2-byte Folded Reload +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: scratch_store_b16 off, v0, s32 dlc +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: spill_2xi16_from_v2i16_one_free_reg: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: scratch_load_u16 v7, off, s32 offset:2 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s32 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill +; GFX11-FAKE16-NEXT: ;;#ASMSTART +; GFX11-FAKE16-NEXT: ;;#ASMEND +; GFX11-FAKE16-NEXT: scratch_store_b16 off, v7, s32 offset:2 dlc +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:8 ; 4-byte Folded Reload +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: scratch_store_b16 off, v0, s32 dlc +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; -; GCN-FAKE16-LABEL: spill_2xi16_from_v2i16_one_free_reg: -; GCN-FAKE16: ; %bb.0: ; %entry -; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-FAKE16-NEXT: scratch_load_u16 v7, off, s32 offset:2 glc dlc -; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GCN-FAKE16-NEXT: scratch_load_u16 v0, off, s32 glc dlc -; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GCN-FAKE16-NEXT: scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill -; GCN-FAKE16-NEXT: ;;#ASMSTART -; GCN-FAKE16-NEXT: ;;#ASMEND -; GCN-FAKE16-NEXT: scratch_store_b16 off, v7, s32 offset:2 dlc -; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GCN-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:8 ; 4-byte Folded Reload -; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GCN-FAKE16-NEXT: scratch_store_b16 off, v0, s32 dlc -; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX12-TRUE16-LABEL: spill_2xi16_from_v2i16_one_free_reg: +; GFX12-TRUE16: ; %bb.0: ; %entry +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:2 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:8 ; 2-byte Folded Spill +; GFX12-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:10 ; 2-byte Folded Spill +; GFX12-TRUE16-NEXT: ;;#ASMSTART +; GFX12-TRUE16-NEXT: ;;#ASMEND +; GFX12-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:8 th:TH_LOAD_LU ; 2-byte Folded Reload +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:10 th:TH_LOAD_LU ; 2-byte Folded Reload +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-TRUE16-LABEL: spill_2xi16_from_v2i16_one_free_reg: ; GFX1250-TRUE16: ; %bb.0: ; %entry @@ -581,19 +762,52 @@ entry: } define void @spill_v2i16() { -; GCN-LABEL: spill_v2i16: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: scratch_load_b32 v0, off, s32 offset:4 glc dlc -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: ;;#ASMSTART -; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: scratch_load_b32 v0, off, s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: scratch_store_b32 off, v0, s32 offset:4 dlc -; GCN-NEXT: s_waitcnt_vscnt null, 0x0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: spill_v2i16: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:4 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill +; GFX11-TRUE16-NEXT: ;;#ASMSTART +; GFX11-TRUE16-NEXT: ;;#ASMEND +; GFX11-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:8 ; 4-byte Folded Reload +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v0, s32 offset:4 dlc +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: spill_v2i16: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:4 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill +; GFX11-FAKE16-NEXT: ;;#ASMSTART +; GFX11-FAKE16-NEXT: ;;#ASMEND +; GFX11-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:8 ; 4-byte Folded Reload +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: scratch_store_b32 off, v0, s32 offset:4 dlc +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-TRUE16-LABEL: spill_v2i16: +; GFX12-TRUE16: ; %bb.0: ; %entry +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:4 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill +; GFX12-TRUE16-NEXT: ;;#ASMSTART +; GFX12-TRUE16-NEXT: ;;#ASMEND +; GFX12-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: scratch_store_b32 off, v0, s32 offset:4 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-LABEL: spill_v2i16: ; GFX1250: ; %bb.0: ; %entry