diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index d80a6f339c8f6..a6c1af24e13e9 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -1823,6 +1823,16 @@ void SIRegisterInfo::buildSpillLoadStore( } } + Register FinalValueReg = ValueReg; + if (LoadStoreOp == AMDGPU::SCRATCH_LOAD_USHORT_SADDR) { + // If we are loading a 16-bit value with SRAMECC enabled we need a temp + // 32-bit VGPR to load and extract 16-bits into the final register. + ValueReg = + RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false, 0); + SubReg = ValueReg; + IsKill = false; + } + MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(RegOffset); MachineMemOperand *NewMMO = MF->getMachineMemOperand(PInfo, MMO->getFlags(), RemEltSize, @@ -1863,6 +1873,17 @@ void SIRegisterInfo::buildSpillLoadStore( MIB.addImm(0); // swz MIB.addMemOperand(NewMMO); + if (FinalValueReg != ValueReg) { + // Extract 16-bit from the loaded 32-bit value. + ValueReg = getSubReg(ValueReg, AMDGPU::lo16); + MIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B16_t16_e64)) + .addReg(FinalValueReg, getDefRegState(true)) + .addImm(0) + .addReg(ValueReg, getKillRegState(true)) + .addImm(0); + ValueReg = FinalValueReg; + } + if (!IsAGPR && NeedSuperRegDef) MIB.addReg(ValueReg, RegState::ImplicitDefine); @@ -2505,7 +2526,9 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, unsigned Opc; if (MI->getOpcode() == AMDGPU::SI_SPILL_V16_RESTORE) { assert(ST.enableFlatScratch() && "Flat Scratch is not enabled!"); - Opc = AMDGPU::SCRATCH_LOAD_SHORT_D16_SADDR_t16; + Opc = ST.d16PreservesUnusedBits() + ? AMDGPU::SCRATCH_LOAD_SHORT_D16_SADDR_t16 + : AMDGPU::SCRATCH_LOAD_USHORT_SADDR; } else { Opc = MI->getOpcode() == AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE ? 
AMDGPU::SCRATCH_LOAD_BLOCK_SADDR diff --git a/llvm/test/CodeGen/AMDGPU/spill_kill_v16.mir b/llvm/test/CodeGen/AMDGPU/spill_kill_v16.mir index 0c694d9f49e18..69895833efccb 100644 --- a/llvm/test/CodeGen/AMDGPU/spill_kill_v16.mir +++ b/llvm/test/CodeGen/AMDGPU/spill_kill_v16.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -march=amdgcn -verify-machineinstrs -mcpu=gfx1100 -mattr=+real-true16 -run-pass=prologepilog -o - %s | FileCheck -check-prefix=EXPANDED %s +# RUN: llc -march=amdgcn -verify-machineinstrs -mcpu=gfx1250 -mattr=+real-true16 -run-pass=prologepilog -o - %s | FileCheck -check-prefix=SRAMECC-EXPANDED %s --- name: spill_restore_vgpr16 @@ -31,6 +32,28 @@ body: | ; EXPANDED-NEXT: $vgpr0_lo16 = SCRATCH_LOAD_SHORT_D16_SADDR_t16 $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s16) from %stack.0, align 4, addrspace 5) ; EXPANDED-NEXT: $vgpr0_hi16 = SCRATCH_LOAD_SHORT_D16_SADDR_t16 $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s16) from %stack.1, align 4, addrspace 5) ; EXPANDED-NEXT: S_NOP 0, implicit killed renamable $vgpr0_lo16, implicit killed renamable $vgpr0_hi16 + ; + ; SRAMECC-EXPANDED-LABEL: name: spill_restore_vgpr16 + ; SRAMECC-EXPANDED: bb.0: + ; SRAMECC-EXPANDED-NEXT: successors: %bb.1(0x80000000) + ; SRAMECC-EXPANDED-NEXT: {{ $}} + ; SRAMECC-EXPANDED-NEXT: S_NOP 0, implicit-def renamable $vgpr0_lo16, implicit-def renamable $vgpr0_hi16 + ; SRAMECC-EXPANDED-NEXT: SCRATCH_STORE_SHORT_SADDR_t16 killed $vgpr0_hi16, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (store (s16) into %stack.1, align 4, addrspace 5) + ; SRAMECC-EXPANDED-NEXT: S_NOP 0, implicit renamable $vgpr0_lo16 + ; SRAMECC-EXPANDED-NEXT: SCRATCH_STORE_SHORT_SADDR_t16 killed $vgpr0_lo16, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into %stack.0, align 4, addrspace 5) + ; SRAMECC-EXPANDED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc + ; SRAMECC-EXPANDED-NEXT: {{ $}} + ; 
SRAMECC-EXPANDED-NEXT: bb.1: + ; SRAMECC-EXPANDED-NEXT: successors: %bb.2(0x80000000) + ; SRAMECC-EXPANDED-NEXT: {{ $}} + ; SRAMECC-EXPANDED-NEXT: S_NOP 1 + ; SRAMECC-EXPANDED-NEXT: {{ $}} + ; SRAMECC-EXPANDED-NEXT: bb.2: + ; SRAMECC-EXPANDED-NEXT: $vgpr1 = SCRATCH_LOAD_USHORT_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s16) from %stack.0, align 4, addrspace 5) + ; SRAMECC-EXPANDED-NEXT: $vgpr0_lo16 = V_MOV_B16_t16_e64 0, killed $vgpr1_lo16, 0, implicit $exec + ; SRAMECC-EXPANDED-NEXT: $vgpr1 = SCRATCH_LOAD_USHORT_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s16) from %stack.1, align 4, addrspace 5) + ; SRAMECC-EXPANDED-NEXT: $vgpr0_hi16 = V_MOV_B16_t16_e64 0, killed $vgpr1_lo16, 0, implicit $exec + ; SRAMECC-EXPANDED-NEXT: S_NOP 0, implicit killed renamable $vgpr0_lo16, implicit killed renamable $vgpr0_hi16 bb.0: successors: %bb.1(0x80000000) S_NOP 0, implicit-def renamable $vgpr0_lo16, implicit-def renamable $vgpr0_hi16 @@ -78,6 +101,29 @@ body: | ; EXPANDED-NEXT: $vgpr0_lo16 = SCRATCH_LOAD_SHORT_D16_SADDR_t16 $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s16) from %stack.0, align 4, addrspace 5) ; EXPANDED-NEXT: $vgpr0_hi16 = SCRATCH_LOAD_SHORT_D16_SADDR_t16 $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s16) from %stack.1, align 4, addrspace 5) ; EXPANDED-NEXT: S_NOP 0, implicit killed renamable $vgpr0_lo16, implicit killed renamable $vgpr0_hi16 + ; + ; SRAMECC-EXPANDED-LABEL: name: spill_restore_vgpr16_middle_of_block + ; SRAMECC-EXPANDED: bb.0: + ; SRAMECC-EXPANDED-NEXT: successors: %bb.1(0x80000000) + ; SRAMECC-EXPANDED-NEXT: {{ $}} + ; SRAMECC-EXPANDED-NEXT: S_NOP 0, implicit-def renamable $vgpr0_lo16, implicit-def renamable $vgpr0_hi16 + ; SRAMECC-EXPANDED-NEXT: SCRATCH_STORE_SHORT_SADDR_t16 killed $vgpr0_hi16, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (store (s16) into %stack.1, align 4, addrspace 5) + ; SRAMECC-EXPANDED-NEXT: S_NOP 0, implicit renamable $vgpr0_lo16 + ; 
SRAMECC-EXPANDED-NEXT: SCRATCH_STORE_SHORT_SADDR_t16 killed $vgpr0_lo16, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into %stack.0, align 4, addrspace 5) + ; SRAMECC-EXPANDED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc + ; SRAMECC-EXPANDED-NEXT: {{ $}} + ; SRAMECC-EXPANDED-NEXT: bb.1: + ; SRAMECC-EXPANDED-NEXT: successors: %bb.2(0x80000000) + ; SRAMECC-EXPANDED-NEXT: {{ $}} + ; SRAMECC-EXPANDED-NEXT: S_NOP 1 + ; SRAMECC-EXPANDED-NEXT: {{ $}} + ; SRAMECC-EXPANDED-NEXT: bb.2: + ; SRAMECC-EXPANDED-NEXT: S_NOP 1 + ; SRAMECC-EXPANDED-NEXT: $vgpr1 = SCRATCH_LOAD_USHORT_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s16) from %stack.0, align 4, addrspace 5) + ; SRAMECC-EXPANDED-NEXT: $vgpr0_lo16 = V_MOV_B16_t16_e64 0, killed $vgpr1_lo16, 0, implicit $exec + ; SRAMECC-EXPANDED-NEXT: $vgpr1 = SCRATCH_LOAD_USHORT_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s16) from %stack.1, align 4, addrspace 5) + ; SRAMECC-EXPANDED-NEXT: $vgpr0_hi16 = V_MOV_B16_t16_e64 0, killed $vgpr1_lo16, 0, implicit $exec + ; SRAMECC-EXPANDED-NEXT: S_NOP 0, implicit killed renamable $vgpr0_lo16, implicit killed renamable $vgpr0_hi16 bb.0: successors: %bb.1(0x80000000) S_NOP 0, implicit-def renamable $vgpr0_lo16, implicit-def renamable $vgpr0_hi16 @@ -124,6 +170,27 @@ body: | ; EXPANDED-NEXT: bb.2: ; EXPANDED-NEXT: $vgpr0_lo16 = SCRATCH_LOAD_SHORT_D16_SADDR_t16 $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s16) from %stack.0, align 4, addrspace 5) ; EXPANDED-NEXT: $vgpr0_hi16 = SCRATCH_LOAD_SHORT_D16_SADDR_t16 $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s16) from %stack.1, align 4, addrspace 5) + ; + ; SRAMECC-EXPANDED-LABEL: name: spill_restore_vgpr16_end_of_block + ; SRAMECC-EXPANDED: bb.0: + ; SRAMECC-EXPANDED-NEXT: successors: %bb.1(0x80000000) + ; SRAMECC-EXPANDED-NEXT: {{ $}} + ; SRAMECC-EXPANDED-NEXT: S_NOP 0, implicit-def renamable $vgpr0_lo16, implicit-def renamable $vgpr0_hi16 + ; 
SRAMECC-EXPANDED-NEXT: SCRATCH_STORE_SHORT_SADDR_t16 killed $vgpr0_hi16, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (store (s16) into %stack.1, align 4, addrspace 5) + ; SRAMECC-EXPANDED-NEXT: S_NOP 0, implicit renamable $vgpr0_lo16 + ; SRAMECC-EXPANDED-NEXT: SCRATCH_STORE_SHORT_SADDR_t16 killed $vgpr0_lo16, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into %stack.0, align 4, addrspace 5) + ; SRAMECC-EXPANDED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc + ; SRAMECC-EXPANDED-NEXT: {{ $}} + ; SRAMECC-EXPANDED-NEXT: bb.1: + ; SRAMECC-EXPANDED-NEXT: successors: %bb.2(0x80000000) + ; SRAMECC-EXPANDED-NEXT: {{ $}} + ; SRAMECC-EXPANDED-NEXT: S_NOP 1 + ; SRAMECC-EXPANDED-NEXT: {{ $}} + ; SRAMECC-EXPANDED-NEXT: bb.2: + ; SRAMECC-EXPANDED-NEXT: $vgpr1 = SCRATCH_LOAD_USHORT_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s16) from %stack.0, align 4, addrspace 5) + ; SRAMECC-EXPANDED-NEXT: $vgpr0_lo16 = V_MOV_B16_t16_e64 0, killed $vgpr1_lo16, 0, implicit $exec + ; SRAMECC-EXPANDED-NEXT: $vgpr1 = SCRATCH_LOAD_USHORT_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s16) from %stack.1, align 4, addrspace 5) + ; SRAMECC-EXPANDED-NEXT: $vgpr0_hi16 = V_MOV_B16_t16_e64 0, killed $vgpr1_lo16, 0, implicit $exec bb.0: successors: %bb.1(0x80000000) S_NOP 0, implicit-def renamable $vgpr0_lo16, implicit-def renamable $vgpr0_hi16 diff --git a/llvm/test/CodeGen/AMDGPU/spillv16.ll b/llvm/test/CodeGen/AMDGPU/spillv16.ll index 0e45df223465d..2d54ac8283a3a 100644 --- a/llvm/test/CodeGen/AMDGPU/spillv16.ll +++ b/llvm/test/CodeGen/AMDGPU/spillv16.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s -check-prefixes=GCN,GCN-TRUE16 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 -enable-misched=0 
-post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s -check-prefixes=GCN,GCN-FAKE16 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+real-true16 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s -check-prefixes=GFX1250,GFX1250-TRUE16 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=-real-true16 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s -check-prefixes=GFX1250,GFX1250-FAKE16 define void @spill_i16_alu() { ; GCN-TRUE16-LABEL: spill_i16_alu: @@ -32,6 +34,41 @@ define void @spill_i16_alu() { ; GCN-FAKE16-NEXT: scratch_store_b16 off, v0, s32 dlc ; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-TRUE16-LABEL: spill_i16_alu: +; GFX1250-TRUE16: ; %bb.0: ; %entry +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: scratch_load_u16 v0, off, s32 scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x7b, v0.l +; GFX1250-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 ; 2-byte Folded Spill +; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-NEXT: ;;#ASMSTART +; GFX1250-TRUE16-NEXT: ;;#ASMEND +; GFX1250-TRUE16-NEXT: scratch_load_u16 v1, off, s32 offset:2 th:TH_LOAD_LU ; 2-byte Folded Reload +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX1250-TRUE16-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: spill_i16_alu: +; GFX1250-FAKE16: ; %bb.0: ; %entry +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: scratch_load_u16 v0, off, s32 scope:SCOPE_SYS +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: v_add_nc_u16 v0, 0x7b, v0 +; GFX1250-FAKE16-NEXT: scratch_store_b32 off, v0, s32 
offset:4 ; 4-byte Folded Spill +; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: ;;#ASMSTART +; GFX1250-FAKE16-NEXT: ;;#ASMEND +; GFX1250-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:4 th:TH_LOAD_LU ; 4-byte Folded Reload +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS +; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] entry: %alloca = alloca i16, i32 1, align 4, addrspace(5) @@ -88,6 +125,51 @@ define void @spill_i16_alu_two_vals() { ; GCN-FAKE16-NEXT: scratch_store_b16 off, v0, s32 offset:4 dlc ; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-TRUE16-LABEL: spill_i16_alu_two_vals: +; GFX1250-TRUE16: ; %bb.0: ; %entry +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: scratch_load_u16 v0, off, s32 scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x7b, v0.l +; GFX1250-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:6 ; 2-byte Folded Spill +; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-NEXT: ;;#ASMSTART +; GFX1250-TRUE16-NEXT: ;;#ASMEND +; GFX1250-TRUE16-NEXT: scratch_load_u16 v0, off, s32 offset:4 scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: scratch_load_u16 v1, off, s32 offset:6 th:TH_LOAD_LU ; 2-byte Folded Reload +; GFX1250-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x7b, v0.l +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX1250-TRUE16-NEXT: scratch_store_d16_hi_b16 off, v0, s32 scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:4 scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: spill_i16_alu_two_vals: +; GFX1250-FAKE16: ; %bb.0: ; 
%entry +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: scratch_load_u16 v0, off, s32 scope:SCOPE_SYS +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: v_add_nc_u16 v0, 0x7b, v0 +; GFX1250-FAKE16-NEXT: scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill +; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: ;;#ASMSTART +; GFX1250-FAKE16-NEXT: ;;#ASMEND +; GFX1250-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:4 scope:SCOPE_SYS +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: scratch_load_b32 v1, off, s32 offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload +; GFX1250-FAKE16-NEXT: v_add_nc_u16 v0, 0x7b, v0 +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: scratch_store_b16 off, v1, s32 scope:SCOPE_SYS +; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-FAKE16-NEXT: scratch_store_b16 off, v0, s32 offset:4 scope:SCOPE_SYS +; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] entry: %alloca = alloca i16, i32 1, align 4, addrspace(5) %alloca2 = alloca i16, i32 1, align 4, addrspace(5) @@ -140,6 +222,22 @@ define void @spill_i16() { ; GCN-FAKE16-NEXT: scratch_store_b16 off, v0, s32 dlc ; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: spill_i16: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: scratch_load_u16 v0, off, s32 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: scratch_store_b32 off, v0, s32 offset:4 ; 4-byte Folded Spill +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: ;;#ASMSTART +; GFX1250-NEXT: ;;#ASMEND +; GFX1250-NEXT: scratch_load_b32 v0, off, s32 offset:4 th:TH_LOAD_LU ; 4-byte Folded Reload +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; 
GFX1250-NEXT: s_set_pc_i64 s[30:31] entry: %alloca = alloca i16, i32 1, align 4, addrspace(5) @@ -183,6 +281,22 @@ define void @spill_half() { ; GCN-FAKE16-NEXT: scratch_store_b16 off, v0, s32 dlc ; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: spill_half: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: scratch_load_u16 v0, off, s32 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: scratch_store_b32 off, v0, s32 offset:4 ; 4-byte Folded Spill +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: ;;#ASMSTART +; GFX1250-NEXT: ;;#ASMEND +; GFX1250-NEXT: scratch_load_b32 v0, off, s32 offset:4 th:TH_LOAD_LU ; 4-byte Folded Reload +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] entry: %alloca = alloca half, i32 1, align 4, addrspace(5) @@ -226,6 +340,22 @@ define void @spill_i16_from_v2i16() { ; GCN-FAKE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 dlc ; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: spill_i16_from_v2i16: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: scratch_load_u16 v0, off, s32 offset:2 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: ;;#ASMSTART +; GFX1250-NEXT: ;;#ASMEND +; GFX1250-NEXT: scratch_load_b32 v0, off, s32 offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: scratch_store_b16 off, v0, s32 offset:2 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] entry: %alloca = alloca <2 x i16>, i32 2, align 1, addrspace(5) @@ -283,6 
+413,54 @@ define void @spill_2xi16_from_v2i16() { ; GCN-FAKE16-NEXT: scratch_store_b16 off, v0, s32 dlc ; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-TRUE16-LABEL: spill_2xi16_from_v2i16: +; GFX1250-TRUE16: ; %bb.0: ; %entry +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: scratch_load_u16 v0, off, s32 offset:2 scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: s_clause 0x1 +; GFX1250-TRUE16-NEXT: scratch_store_b32 off, v0, s32 offset:12 +; GFX1250-TRUE16-NEXT: scratch_load_u16 v0, off, s32 scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill +; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-NEXT: ;;#ASMSTART +; GFX1250-TRUE16-NEXT: ;;#ASMEND +; GFX1250-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:12 th:TH_LOAD_LU ; 4-byte Folded Reload +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: spill_2xi16_from_v2i16: +; GFX1250-FAKE16: ; %bb.0: ; %entry +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:2 scope:SCOPE_SYS +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: s_clause 0x1 +; GFX1250-FAKE16-NEXT: scratch_store_b32 off, v0, s32 offset:8 +; GFX1250-FAKE16-NEXT: scratch_load_u16 v0, off, s32 scope:SCOPE_SYS +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; 
GFX1250-FAKE16-NEXT: scratch_store_b32 off, v0, s32 offset:12 ; 4-byte Folded Spill +; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: ;;#ASMSTART +; GFX1250-FAKE16-NEXT: ;;#ASMEND +; GFX1250-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 scope:SCOPE_SYS +; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:12 th:TH_LOAD_LU ; 4-byte Folded Reload +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS +; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] entry: %alloca = alloca <2 x i16>, i32 2, align 1, addrspace(5) @@ -341,6 +519,47 @@ define void @spill_2xi16_from_v2i16_one_free_reg() { ; GCN-FAKE16-NEXT: scratch_store_b16 off, v0, s32 dlc ; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-TRUE16-LABEL: spill_2xi16_from_v2i16_one_free_reg: +; GFX1250-TRUE16: ; %bb.0: ; %entry +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: scratch_load_u16 v7, off, s32 offset:2 scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: scratch_load_u16 v0, off, s32 scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill +; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-NEXT: ;;#ASMSTART +; GFX1250-TRUE16-NEXT: ;;#ASMEND +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v0.l, v7.l +; GFX1250-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload +; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; 
GFX1250-TRUE16-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: spill_2xi16_from_v2i16_one_free_reg: +; GFX1250-FAKE16: ; %bb.0: ; %entry +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: scratch_load_u16 v7, off, s32 offset:2 scope:SCOPE_SYS +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: scratch_load_u16 v0, off, s32 scope:SCOPE_SYS +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill +; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: ;;#ASMSTART +; GFX1250-FAKE16-NEXT: ;;#ASMEND +; GFX1250-FAKE16-NEXT: scratch_store_b16 off, v7, s32 offset:2 scope:SCOPE_SYS +; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload +; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS +; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] entry: %alloca = alloca <2 x i16>, i32 2, align 1, addrspace(5) @@ -375,6 +594,22 @@ define void @spill_v2i16() { ; GCN-NEXT: scratch_store_b32 off, v0, s32 offset:4 dlc ; GCN-NEXT: s_waitcnt_vscnt null, 0x0 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: spill_v2i16: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: scratch_load_b32 v0, off, s32 offset:4 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: ;;#ASMSTART +; GFX1250-NEXT: ;;#ASMEND +; GFX1250-NEXT: scratch_load_b32 v0, off, s32 offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; 
GFX1250-NEXT: scratch_store_b32 off, v0, s32 offset:4 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] entry: %alloca = alloca <2 x i16>, i32 2, align 1, addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/spillv16.mir b/llvm/test/CodeGen/AMDGPU/spillv16.mir index 05569bf394c43..ba2d926eb8883 100644 --- a/llvm/test/CodeGen/AMDGPU/spillv16.mir +++ b/llvm/test/CodeGen/AMDGPU/spillv16.mir @@ -1,6 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -march=amdgcn -verify-machineinstrs -mcpu=gfx1100 -mattr=+real-true16 -run-pass=regallocfast -o - %s | FileCheck -check-prefix=SPILLED %s # RUN: llc -march=amdgcn -verify-machineinstrs -mcpu=gfx1100 -mattr=+real-true16 -run-pass=regallocfast,prologepilog -o - %s | FileCheck -check-prefix=EXPANDED %s +# RUN: llc -march=amdgcn -verify-machineinstrs -mcpu=gfx1250 -mattr=+real-true16 -run-pass=regallocfast,prologepilog -o - %s | FileCheck -check-prefix=SRAMECC-EXPANDED %s --- name: spill_restore_vgpr16 @@ -46,6 +47,27 @@ body: | ; EXPANDED-NEXT: $vgpr0_lo16 = SCRATCH_LOAD_SHORT_D16_SADDR_t16 $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s16) from %stack.0, addrspace 5) ; EXPANDED-NEXT: $vgpr0_hi16 = SCRATCH_LOAD_SHORT_D16_SADDR_t16 $sgpr32, 2, 0, implicit $exec, implicit $flat_scr :: (load (s16) from %stack.1, addrspace 5) ; EXPANDED-NEXT: S_NOP 0, implicit killed renamable $vgpr0_lo16, implicit killed renamable $vgpr0_hi16 + ; + ; SRAMECC-EXPANDED-LABEL: name: spill_restore_vgpr16 + ; SRAMECC-EXPANDED: bb.0: + ; SRAMECC-EXPANDED-NEXT: successors: %bb.1(0x80000000) + ; SRAMECC-EXPANDED-NEXT: {{ $}} + ; SRAMECC-EXPANDED-NEXT: S_NOP 0, implicit-def renamable $vgpr0_lo16, implicit-def renamable $vgpr0_hi16 + ; SRAMECC-EXPANDED-NEXT: SCRATCH_STORE_SHORT_SADDR_t16 killed $vgpr0_hi16, $sgpr32, 2, 0, implicit $exec, implicit $flat_scr :: (store (s16) into %stack.1, addrspace 5) + ; SRAMECC-EXPANDED-NEXT: SCRATCH_STORE_SHORT_SADDR_t16 
killed $vgpr0_lo16, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into %stack.0, addrspace 5) + ; SRAMECC-EXPANDED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc + ; SRAMECC-EXPANDED-NEXT: {{ $}} + ; SRAMECC-EXPANDED-NEXT: bb.1: + ; SRAMECC-EXPANDED-NEXT: successors: %bb.2(0x80000000) + ; SRAMECC-EXPANDED-NEXT: {{ $}} + ; SRAMECC-EXPANDED-NEXT: S_NOP 1 + ; SRAMECC-EXPANDED-NEXT: {{ $}} + ; SRAMECC-EXPANDED-NEXT: bb.2: + ; SRAMECC-EXPANDED-NEXT: $vgpr1 = SCRATCH_LOAD_USHORT_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s16) from %stack.0, addrspace 5) + ; SRAMECC-EXPANDED-NEXT: $vgpr0_lo16 = V_MOV_B16_t16_e64 0, killed $vgpr1_lo16, 0, implicit $exec + ; SRAMECC-EXPANDED-NEXT: $vgpr1 = SCRATCH_LOAD_USHORT_SADDR $sgpr32, 2, 0, implicit $exec, implicit $flat_scr :: (load (s16) from %stack.1, addrspace 5) + ; SRAMECC-EXPANDED-NEXT: $vgpr0_hi16 = V_MOV_B16_t16_e64 0, killed $vgpr1_lo16, 0, implicit $exec + ; SRAMECC-EXPANDED-NEXT: S_NOP 0, implicit killed renamable $vgpr0_lo16, implicit killed renamable $vgpr0_hi16 bb.0: S_NOP 0, implicit-def %0:vgpr_16, implicit-def %1:vgpr_16 S_CBRANCH_SCC1 implicit undef $scc, %bb.1