From bfe416640220a8e3d184783497a5a591e04af811 Mon Sep 17 00:00:00 2001 From: Anshil Gandhi Date: Fri, 7 Nov 2025 12:24:43 -0500 Subject: [PATCH] [AMDGPU] Allow negative offsets in scratch insts This patch enables the LocalStackSlotAllocation pass to reuse base registers in scratch spill/reload instructions. Furthermore, it resolves issue #155902, as PEI no longer needs to scavenge an SGPR there. --- llvm/lib/Target/AMDGPU/AMDGPU.td | 10 +- .../CodeGen/AMDGPU/GlobalISel/flat-scratch.ll | 28 +-- .../flat-scratch-neg-offset-bug-155902.ll | 232 ++++++++++++++++++ llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll | 7 +- llvm/test/CodeGen/AMDGPU/flat-scratch.ll | 51 ++-- .../local-stack-alloc-block-sp-reference.ll | 27 +- 6 files changed, 284 insertions(+), 71 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/flat-scratch-neg-offset-bug-155902.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 5dea64844e64e..e25b49f5abc8b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -334,12 +334,6 @@ def FeatureFlatSegmentOffsetBug : SubtargetFeature<"flat-segment-offset-bug", "GFX10 bug where inst_offset is ignored when flat instructions access global memory" >; -def FeatureNegativeScratchOffsetBug : SubtargetFeature<"negative-scratch-offset-bug", - "NegativeScratchOffsetBug", - "true", - "Negative immediate offsets in scratch instructions with an SGPR offset page fault on GFX9" ->; - def FeatureNegativeUnalignedScratchOffsetBug : SubtargetFeature<"negative-unaligned-scratch-offset-bug", "NegativeUnalignedScratchOffsetBug", "true", @@ -1588,8 +1582,8 @@ def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9", FeatureScalarFlatScratchInsts, FeatureScalarAtomics, FeatureR128A16, FeatureA16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureSupportsXNACK, FeatureUnalignedBufferAccess, FeatureUnalignedScratchAccess, - FeatureUnalignedDSAccess, FeatureNegativeScratchOffsetBug, FeatureGWS, - 
FeatureDefaultComponentZero,FeatureVmemWriteVgprInOrder, FeatureMemToLDSLoad, + FeatureUnalignedDSAccess, FeatureGWS, FeatureDefaultComponentZero, + FeatureVmemWriteVgprInOrder, FeatureMemToLDSLoad, FeatureCubeInsts, FeatureLerpInst, FeatureSadInsts, FeatureQsadInsts, FeatureCvtNormInsts, FeatureCvtPkNormVOP2Insts, FeatureCvtPkNormVOP3Insts diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll index 13d6597bd78f1..8831f306637fe 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll @@ -3947,9 +3947,9 @@ define amdgpu_gs void @sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset(pt ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s5 ; GFX9-NEXT: v_add_u32_e32 v0, s3, v0 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 -; GFX9-NEXT: v_add3_u32 v0, s2, v0, -16 +; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 15 -; GFX9-NEXT: scratch_store_dword v0, v1, off +; GFX9-NEXT: scratch_store_dword v0, v1, off offset:-16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; @@ -3969,9 +3969,9 @@ define amdgpu_gs void @sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset(pt ; GFX942-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: v_add_u32_e32 v0, s1, v0 -; GFX942-NEXT: v_add3_u32 v0, s0, v0, -16 +; GFX942-NEXT: v_add_u32_e32 v0, s0, v0 ; GFX942-NEXT: v_mov_b32_e32 v1, 15 -; GFX942-NEXT: scratch_store_dword v0, v1, off sc0 sc1 +; GFX942-NEXT: scratch_store_dword v0, v1, off offset:-16 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_endpgm ; @@ -3996,9 +3996,9 @@ define amdgpu_gs void @sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset(pt ; UNALIGNED_GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s5 ; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v0, s3, v0 ; UNALIGNED_GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 -; UNALIGNED_GFX9-NEXT: v_add3_u32 v0, s2, v0, -16 +; 
UNALIGNED_GFX9-NEXT: v_add_u32_e32 v0, s2, v0 ; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v1, 15 -; UNALIGNED_GFX9-NEXT: scratch_store_dword v0, v1, off +; UNALIGNED_GFX9-NEXT: scratch_store_dword v0, v1, off offset:-16 ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX9-NEXT: s_endpgm ; @@ -4018,9 +4018,9 @@ define amdgpu_gs void @sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset(pt ; UNALIGNED_GFX942-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset: ; UNALIGNED_GFX942: ; %bb.0: ; %bb ; UNALIGNED_GFX942-NEXT: v_add_u32_e32 v0, s1, v0 -; UNALIGNED_GFX942-NEXT: v_add3_u32 v0, s0, v0, -16 +; UNALIGNED_GFX942-NEXT: v_add_u32_e32 v0, s0, v0 ; UNALIGNED_GFX942-NEXT: v_mov_b32_e32 v1, 15 -; UNALIGNED_GFX942-NEXT: scratch_store_dword v0, v1, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: scratch_store_dword v0, v1, off offset:-16 sc0 sc1 ; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX942-NEXT: s_endpgm ; @@ -4052,8 +4052,7 @@ define amdgpu_gs void @sgpr_base_negative_offset(ptr addrspace(1) %out, ptr addr ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s5 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 -; GFX9-NEXT: s_add_u32 s0, s2, 0xffffffe8 -; GFX9-NEXT: scratch_load_dword v2, off, s0 +; GFX9-NEXT: scratch_load_dword v2, off, s2 offset:-24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm @@ -4071,8 +4070,7 @@ define amdgpu_gs void @sgpr_base_negative_offset(ptr addrspace(1) %out, ptr addr ; ; GFX942-LABEL: sgpr_base_negative_offset: ; GFX942: ; %bb.0: ; %entry -; GFX942-NEXT: s_add_u32 s0, s0, 0xffffffe8 -; GFX942-NEXT: scratch_load_dword v2, off, s0 +; GFX942-NEXT: scratch_load_dword v2, off, s0 offset:-24 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_store_dword v[0:1], v2, off ; GFX942-NEXT: s_endpgm @@ -4095,8 +4093,7 @@ define amdgpu_gs void @sgpr_base_negative_offset(ptr addrspace(1) %out, ptr addr ; UNALIGNED_GFX9: ; %bb.0: ; %entry ; UNALIGNED_GFX9-NEXT: 
s_add_u32 flat_scratch_lo, s0, s5 ; UNALIGNED_GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 -; UNALIGNED_GFX9-NEXT: s_add_u32 s0, s2, 0xffffffe8 -; UNALIGNED_GFX9-NEXT: scratch_load_dword v2, off, s0 +; UNALIGNED_GFX9-NEXT: scratch_load_dword v2, off, s2 offset:-24 ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX9-NEXT: global_store_dword v[0:1], v2, off ; UNALIGNED_GFX9-NEXT: s_endpgm @@ -4114,8 +4111,7 @@ define amdgpu_gs void @sgpr_base_negative_offset(ptr addrspace(1) %out, ptr addr ; ; UNALIGNED_GFX942-LABEL: sgpr_base_negative_offset: ; UNALIGNED_GFX942: ; %bb.0: ; %entry -; UNALIGNED_GFX942-NEXT: s_add_u32 s0, s0, 0xffffffe8 -; UNALIGNED_GFX942-NEXT: scratch_load_dword v2, off, s0 +; UNALIGNED_GFX942-NEXT: scratch_load_dword v2, off, s0 offset:-24 ; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX942-NEXT: global_store_dword v[0:1], v2, off ; UNALIGNED_GFX942-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-neg-offset-bug-155902.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-neg-offset-bug-155902.ll new file mode 100644 index 0000000000000..93ae5f3124e79 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-neg-offset-bug-155902.ll @@ -0,0 +1,232 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -verify-machineinstrs -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 | FileCheck %s --check-prefix=GFX950 + +; Ensure we don't crash with: "Cannot scavenge register in FI elimination!" 
+define amdgpu_kernel void @issue155902(i64 %arg, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %arg5, i64 %arg6, i64 %arg7, i64 %arg8, i64 %arg9, i64 %arg10, i64 %arg11, i64 %arg12, i64 %arg13, i64 %arg14, i64 %arg15, i64 %arg16, i64 %arg17, i64 %arg18, i64 %arg19, i64 %arg20, i64 %arg21, i64 %arg22, i64 %arg23, i64 %arg24, i64 %arg25, i64 %arg26, i64 %arg27, i64 %arg28, i64 %arg29, i64 %arg30, i64 %arg31, i64 %arg32, i64 %arg33, i64 %arg34, i64 %arg35, i64 %arg36, i64 %arg37, i64 %arg38, i64 %arg39, i64 %arg40, i64 %arg41, i64 %arg42, i64 %arg43, i64 %arg44, i64 %arg45, i64 %arg46, i64 %arg47, i64 %arg48, i64 %arg49) { +; GFX950-LABEL: issue155902: +; GFX950: ; %bb.0: ; %bb +; GFX950-NEXT: s_mov_b32 s0, 8 +; GFX950-NEXT: s_mov_b32 s1, 0x4008 +; GFX950-NEXT: s_add_i32 s33, s0, s1 +; GFX950-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX950-NEXT: s_load_dwordx2 vcc, s[2:3], 0x8 +; GFX950-NEXT: s_load_dwordx2 s[98:99], s[2:3], 0x10 +; GFX950-NEXT: s_load_dwordx2 s[96:97], s[2:3], 0x18 +; GFX950-NEXT: s_load_dwordx2 s[94:95], s[2:3], 0x20 +; GFX950-NEXT: s_load_dwordx2 s[92:93], s[2:3], 0x28 +; GFX950-NEXT: s_load_dwordx2 s[90:91], s[2:3], 0x30 +; GFX950-NEXT: s_load_dwordx2 s[88:89], s[2:3], 0x38 +; GFX950-NEXT: s_load_dwordx2 s[86:87], s[2:3], 0x40 +; GFX950-NEXT: s_load_dwordx2 s[84:85], s[2:3], 0x48 +; GFX950-NEXT: s_load_dwordx2 s[82:83], s[2:3], 0x50 +; GFX950-NEXT: s_load_dwordx2 s[80:81], s[2:3], 0x58 +; GFX950-NEXT: s_load_dwordx2 s[78:79], s[2:3], 0x60 +; GFX950-NEXT: s_load_dwordx2 s[76:77], s[2:3], 0x68 +; GFX950-NEXT: s_load_dwordx2 s[74:75], s[2:3], 0x70 +; GFX950-NEXT: s_load_dwordx2 s[72:73], s[2:3], 0x78 +; GFX950-NEXT: s_load_dwordx2 s[70:71], s[2:3], 0x80 +; GFX950-NEXT: s_load_dwordx2 s[68:69], s[2:3], 0x88 +; GFX950-NEXT: s_load_dwordx2 s[66:67], s[2:3], 0x90 +; GFX950-NEXT: s_load_dwordx2 s[64:65], s[2:3], 0x98 +; GFX950-NEXT: s_load_dwordx2 s[62:63], s[2:3], 0xa0 +; GFX950-NEXT: s_load_dwordx2 
s[60:61], s[2:3], 0xa8 +; GFX950-NEXT: s_load_dwordx2 s[58:59], s[2:3], 0xb0 +; GFX950-NEXT: s_load_dwordx2 s[56:57], s[2:3], 0xb8 +; GFX950-NEXT: s_load_dwordx2 s[54:55], s[2:3], 0xc0 +; GFX950-NEXT: s_load_dwordx2 s[52:53], s[2:3], 0xc8 +; GFX950-NEXT: s_load_dwordx2 s[50:51], s[2:3], 0xd0 +; GFX950-NEXT: s_load_dwordx2 s[48:49], s[2:3], 0xd8 +; GFX950-NEXT: s_load_dwordx2 s[46:47], s[2:3], 0xe0 +; GFX950-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0xe8 +; GFX950-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0xf0 +; GFX950-NEXT: s_load_dwordx2 s[40:41], s[2:3], 0xf8 +; GFX950-NEXT: s_load_dwordx2 s[38:39], s[2:3], 0x100 +; GFX950-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x108 +; GFX950-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x110 +; GFX950-NEXT: s_load_dwordx2 s[30:31], s[2:3], 0x118 +; GFX950-NEXT: s_load_dwordx2 s[28:29], s[2:3], 0x120 +; GFX950-NEXT: s_load_dwordx2 s[26:27], s[2:3], 0x128 +; GFX950-NEXT: s_load_dwordx2 s[24:25], s[2:3], 0x130 +; GFX950-NEXT: s_load_dwordx2 s[22:23], s[2:3], 0x138 +; GFX950-NEXT: s_load_dwordx2 s[20:21], s[2:3], 0x140 +; GFX950-NEXT: s_load_dwordx2 s[18:19], s[2:3], 0x148 +; GFX950-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x150 +; GFX950-NEXT: s_load_dwordx2 s[14:15], s[2:3], 0x158 +; GFX950-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x160 +; GFX950-NEXT: s_load_dwordx2 s[10:11], s[2:3], 0x168 +; GFX950-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x170 +; GFX950-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x178 +; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x180 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x188 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], 0 +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s33 +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s33 offset:-8 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], 0x384 +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s33 offset:8 +; GFX950-NEXT: s_mov_b32 s33, 0 +; GFX950-NEXT: ; implicit-def: $vgpr2 : SGPR spill to VGPR lane +; GFX950-NEXT: v_writelane_b32 v2, s33, 0 +; GFX950-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: v_readlane_b32 s0, v2, 0 +; GFX950-NEXT: s_nop 4 +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], vcc +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[98:99] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[96:97] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[94:95] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[92:93] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[90:91] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[88:89] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[86:87] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[84:85] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[82:83] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[80:81] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[78:79] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[76:77] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[74:75] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[72:73] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[70:71] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[68:69] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[66:67] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 
+; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[64:65] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[62:63] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[60:61] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[58:59] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[56:57] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[54:55] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[52:53] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[50:51] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[48:49] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[46:47] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[44:45] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[42:43] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[40:41] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[38:39] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[36:37] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[34:35] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[30:31] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[28:29] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[26:27] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; GFX950-NEXT: 
scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[22:23] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[20:21] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[18:19] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[16:17] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[14:15] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[12:13] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[10:11] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[4:5] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: s_endpgm +bb: + %alloca.big = alloca [4096 x i32], align 4, addrspace(5) + %alloca304 = alloca [2 x i64], align 8, addrspace(5) + %alloca307 = alloca i64, align 8, addrspace(5) + store [2 x i64] zeroinitializer, ptr addrspace(5) %alloca304, align 8 + store i64 900, ptr addrspace(5) %alloca307, align 8 + store i64 %arg, ptr addrspace(5) null, align 8 + store i64 %arg1, ptr addrspace(5) null, align 8 + store i64 %arg2, ptr addrspace(5) null, align 8 + store i64 %arg3, ptr addrspace(5) null, align 8 + store i64 %arg4, ptr addrspace(5) null, align 8 + store i64 %arg5, ptr addrspace(5) null, align 8 + store i64 %arg6, ptr addrspace(5) null, align 8 + store i64 %arg7, ptr addrspace(5) null, align 8 + store i64 %arg8, ptr addrspace(5) null, align 8 + store i64 %arg9, ptr addrspace(5) null, align 8 + store i64 %arg10, ptr addrspace(5) 
null, align 8 + store i64 %arg11, ptr addrspace(5) null, align 8 + store i64 %arg12, ptr addrspace(5) null, align 8 + store i64 %arg13, ptr addrspace(5) null, align 8 + store i64 %arg14, ptr addrspace(5) null, align 8 + store i64 %arg15, ptr addrspace(5) null, align 8 + store i64 %arg16, ptr addrspace(5) null, align 8 + store i64 %arg17, ptr addrspace(5) null, align 8 + store i64 %arg18, ptr addrspace(5) null, align 8 + store i64 %arg19, ptr addrspace(5) null, align 8 + store i64 %arg20, ptr addrspace(5) null, align 8 + store i64 %arg21, ptr addrspace(5) null, align 8 + store i64 %arg22, ptr addrspace(5) null, align 8 + store i64 %arg23, ptr addrspace(5) null, align 8 + store i64 %arg24, ptr addrspace(5) null, align 8 + store i64 %arg25, ptr addrspace(5) null, align 8 + store i64 %arg26, ptr addrspace(5) null, align 8 + store i64 %arg27, ptr addrspace(5) null, align 8 + store i64 %arg28, ptr addrspace(5) null, align 8 + store i64 %arg29, ptr addrspace(5) null, align 8 + store i64 %arg30, ptr addrspace(5) null, align 8 + store i64 %arg31, ptr addrspace(5) null, align 8 + store i64 %arg32, ptr addrspace(5) null, align 8 + store i64 %arg33, ptr addrspace(5) null, align 8 + store i64 %arg34, ptr addrspace(5) null, align 8 + store i64 %arg35, ptr addrspace(5) null, align 8 + store i64 %arg36, ptr addrspace(5) null, align 8 + store i64 %arg37, ptr addrspace(5) null, align 8 + store i64 %arg38, ptr addrspace(5) null, align 8 + store i64 %arg39, ptr addrspace(5) null, align 8 + store i64 %arg40, ptr addrspace(5) null, align 8 + store i64 %arg41, ptr addrspace(5) null, align 8 + store i64 %arg42, ptr addrspace(5) null, align 8 + store i64 %arg43, ptr addrspace(5) null, align 8 + store i64 %arg44, ptr addrspace(5) null, align 8 + store i64 %arg45, ptr addrspace(5) null, align 8 + store i64 %arg46, ptr addrspace(5) null, align 8 + store i64 %arg47, ptr addrspace(5) null, align 8 + store i64 %arg48, ptr addrspace(5) null, align 8 + store i64 %arg49, ptr addrspace(5) null, 
align 8 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll index e01cb79382c05..737c811cd9d93 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll @@ -1582,8 +1582,7 @@ define amdgpu_kernel void @soff1_voff1_negative(i32 %soff) { ; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-SDAG-NEXT: v_add_u32_e32 v0, s0, v0 -; GFX942-SDAG-NEXT: v_add_u32_e32 v0, -1, v0 -; GFX942-SDAG-NEXT: scratch_store_byte v0, v1, off sc0 sc1 +; GFX942-SDAG-NEXT: scratch_store_byte v0, v1, off offset:-1 sc0 sc1 ; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX942-SDAG-NEXT: s_endpgm ; @@ -1593,8 +1592,8 @@ define amdgpu_kernel void @soff1_voff1_negative(i32 %soff) { ; GFX942-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 1 ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-GISEL-NEXT: v_add3_u32 v0, s0, v0, -1 -; GFX942-GISEL-NEXT: scratch_store_byte v0, v1, off sc0 sc1 +; GFX942-GISEL-NEXT: v_add_u32_e32 v0, s0, v0 +; GFX942-GISEL-NEXT: scratch_store_byte v0, v1, off offset:-1 sc0 sc1 ; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX942-GISEL-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll index b2e9831d6c84f..c02130736e3d9 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -4561,11 +4561,10 @@ define void @store_load_i32_negative_unaligned(ptr addrspace(5) nocapture %arg) ; GFX9-LABEL: store_load_i32_negative_unaligned: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v0, -1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 1 -; GFX9-NEXT: scratch_store_byte v0, v1, off +; GFX9-NEXT: scratch_store_byte v0, v1, off offset:-1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: scratch_load_ubyte v0, v0, off glc +; GFX9-NEXT: scratch_load_ubyte v0, v0, off 
offset:-1 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -4632,22 +4631,20 @@ define void @store_load_i32_negative_unaligned(ptr addrspace(5) nocapture %arg) ; GFX9-PAL-LABEL: store_load_i32_negative_unaligned: ; GFX9-PAL: ; %bb.0: ; %bb ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-PAL-NEXT: v_add_u32_e32 v0, -1, v0 ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 1 -; GFX9-PAL-NEXT: scratch_store_byte v0, v1, off +; GFX9-PAL-NEXT: scratch_store_byte v0, v1, off offset:-1 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: scratch_load_ubyte v0, v0, off glc +; GFX9-PAL-NEXT: scratch_load_ubyte v0, v0, off offset:-1 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: store_load_i32_negative_unaligned: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_add_u32_e32 v0, -1, v0 ; GFX942-NEXT: v_mov_b32_e32 v1, 1 -; GFX942-NEXT: scratch_store_byte v0, v1, off sc0 sc1 +; GFX942-NEXT: scratch_store_byte v0, v1, off offset:-1 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: scratch_load_ubyte v0, v0, off sc0 sc1 +; GFX942-NEXT: scratch_load_ubyte v0, v0, off offset:-1 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ -4732,11 +4729,11 @@ define void @store_load_i32_large_negative_unaligned(ptr addrspace(5) nocapture ; GFX9-LABEL: store_load_i32_large_negative_unaligned: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v0, 0xffffef7f, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 0xfffff000, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 1 -; GFX9-NEXT: scratch_store_byte v0, v1, off +; GFX9-NEXT: scratch_store_byte v0, v1, off offset:-129 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: scratch_load_ubyte v0, v0, off glc +; GFX9-NEXT: scratch_load_ubyte v0, v0, off offset:-129 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -4805,22 +4802,22 @@ define 
void @store_load_i32_large_negative_unaligned(ptr addrspace(5) nocapture ; GFX9-PAL-LABEL: store_load_i32_large_negative_unaligned: ; GFX9-PAL: ; %bb.0: ; %bb ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-PAL-NEXT: v_add_u32_e32 v0, 0xffffef7f, v0 +; GFX9-PAL-NEXT: v_add_u32_e32 v0, 0xfffff000, v0 ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 1 -; GFX9-PAL-NEXT: scratch_store_byte v0, v1, off +; GFX9-PAL-NEXT: scratch_store_byte v0, v1, off offset:-129 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: scratch_load_ubyte v0, v0, off glc +; GFX9-PAL-NEXT: scratch_load_ubyte v0, v0, off offset:-129 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: store_load_i32_large_negative_unaligned: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_add_u32_e32 v0, 0xffffef7f, v0 +; GFX942-NEXT: v_add_u32_e32 v0, 0xfffff000, v0 ; GFX942-NEXT: v_mov_b32_e32 v1, 1 -; GFX942-NEXT: scratch_store_byte v0, v1, off sc0 sc1 +; GFX942-NEXT: scratch_store_byte v0, v1, off offset:-129 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: scratch_load_ubyte v0, v0, off sc0 sc1 +; GFX942-NEXT: scratch_load_ubyte v0, v0, off offset:-129 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ -5485,9 +5482,8 @@ define amdgpu_gs void @sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset(pt ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; GFX9-NEXT: s_add_i32 s2, s2, s3 ; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 -; GFX9-NEXT: v_add_u32_e32 v0, -16, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 15 -; GFX9-NEXT: scratch_store_dword v0, v1, off +; GFX9-NEXT: scratch_store_dword v0, v1, off offset:-16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; @@ -5531,8 +5527,7 @@ define amdgpu_gs void @sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset(pt ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; GFX9-PAL-NEXT: s_add_i32 s0, s0, s1 ; GFX9-PAL-NEXT: v_add_u32_e32 v0, 
s0, v0 -; GFX9-PAL-NEXT: v_add_u32_e32 v0, -16, v0 -; GFX9-PAL-NEXT: scratch_store_dword v0, v1, off +; GFX9-PAL-NEXT: scratch_store_dword v0, v1, off offset:-16 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_endpgm ; @@ -5540,9 +5535,8 @@ define amdgpu_gs void @sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset(pt ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: s_add_i32 s0, s0, s1 ; GFX942-NEXT: v_add_u32_e32 v0, s0, v0 -; GFX942-NEXT: v_add_u32_e32 v0, -16, v0 ; GFX942-NEXT: v_mov_b32_e32 v1, 15 -; GFX942-NEXT: scratch_store_dword v0, v1, off sc0 sc1 +; GFX942-NEXT: scratch_store_dword v0, v1, off offset:-16 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_endpgm ; @@ -5591,8 +5585,7 @@ define amdgpu_gs void @sgpr_base_negative_offset(ptr addrspace(1) %out, ptr addr ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s5 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 -; GFX9-NEXT: s_addk_i32 s2, 0xffe8 -; GFX9-NEXT: scratch_load_dword v2, off, s2 +; GFX9-NEXT: scratch_load_dword v2, off, s2 offset:-24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm @@ -5631,16 +5624,14 @@ define amdgpu_gs void @sgpr_base_negative_offset(ptr addrspace(1) %out, ptr addr ; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff ; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s5 ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 -; GFX9-PAL-NEXT: s_addk_i32 s0, 0xffe8 -; GFX9-PAL-NEXT: scratch_load_dword v2, off, s0 +; GFX9-PAL-NEXT: scratch_load_dword v2, off, s0 offset:-24 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-PAL-NEXT: s_endpgm ; ; GFX942-LABEL: sgpr_base_negative_offset: ; GFX942: ; %bb.0: ; %entry -; GFX942-NEXT: s_addk_i32 s0, 0xffe8 -; GFX942-NEXT: scratch_load_dword v2, off, s0 +; GFX942-NEXT: scratch_load_dword v2, off, s0 offset:-24 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_store_dword v[0:1], v2, off ; GFX942-NEXT: s_endpgm diff --git 
a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll index 5f0ca7bc42ae0..b23be8501b97e 100644 --- a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll +++ b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll @@ -294,28 +294,29 @@ define amdgpu_kernel void @local_stack_offset_uses_sp_flat(ptr addrspace(1) %out ; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 ; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 -; FLATSCR-NEXT: s_mov_b32 s0, 0 -; FLATSCR-NEXT: scratch_store_dword off, v0, s0 offset:1024 +; FLATSCR-NEXT: s_movk_i32 s1, 0x2000 +; FLATSCR-NEXT: s_movk_i32 s0, 0x4010 +; FLATSCR-NEXT: scratch_store_dword off, v0, s1 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_mov_b32 s1, 0 ; FLATSCR-NEXT: .LBB2_1: ; %loadstoreloop ; FLATSCR-NEXT: ; =>This Inner Loop Header: Depth=1 -; FLATSCR-NEXT: s_add_i32 s1, s0, 0x2000 -; FLATSCR-NEXT: s_add_i32 s0, s0, 1 -; FLATSCR-NEXT: s_cmpk_lt_u32 s0, 0x2120 -; FLATSCR-NEXT: scratch_store_byte off, v0, s1 +; FLATSCR-NEXT: s_add_i32 s2, s1, 0x4000 +; FLATSCR-NEXT: s_add_i32 s1, s1, 1 +; FLATSCR-NEXT: s_cmpk_lt_u32 s1, 0x2120 +; FLATSCR-NEXT: scratch_store_byte off, v0, s2 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_cbranch_scc1 .LBB2_1 ; FLATSCR-NEXT: ; %bb.2: ; %split -; FLATSCR-NEXT: s_movk_i32 s0, 0x1000 -; FLATSCR-NEXT: s_addk_i32 s0, 0x2000 -; FLATSCR-NEXT: scratch_load_dwordx2 v[8:9], off, s0 offset:720 glc +; FLATSCR-NEXT: s_movk_i32 s1, 0x1000 +; FLATSCR-NEXT: s_addk_i32 s1, 0x4000 +; FLATSCR-NEXT: scratch_load_dwordx2 v[8:9], off, s1 offset:720 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 offset:704 glc +; FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s1 offset:704 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: s_movk_i32 s0, 0x2000 -; FLATSCR-NEXT: scratch_load_dwordx2 v[10:11], off, s0 offset:16 glc 
+; FLATSCR-NEXT: scratch_load_dwordx2 v[10:11], off, s0 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: scratch_load_dwordx4 v[4:7], off, s0 glc +; FLATSCR-NEXT: scratch_load_dwordx4 v[4:7], off, s0 offset:-16 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; FLATSCR-NEXT: v_mov_b32_e32 v12, 0