Skip to content

Commit

Permalink
[AMDGPU] Stop combining arbitrary offsets into PAL relocs (#80034)
Browse files Browse the repository at this point in the history
PAL uses ELF REL (not RELA) relocations which can only store a 32-bit
addend in the instruction, even for reloc types like R_AMDGPU_ABS32_HI
which require the upper 32 bits of a 64-bit address calculation to be
correct. This means that it is not safe to fold an arbitrary offset into
a GlobalAddressSDNode, so stop doing that.

In practice this is mostly a problem for small negative offsets which do
not work as expected because PAL treats the 32-bit addend as unsigned.
  • Loading branch information
jayfoad committed Jan 31, 2024
1 parent 50e80e0 commit c2c650f
Show file tree
Hide file tree
Showing 4 changed files with 39 additions and 16 deletions.
11 changes: 11 additions & 0 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7133,6 +7133,17 @@ SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,

bool
SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
// OSes that use ELF REL relocations (instead of RELA) can only store a
// 32-bit addend in the instruction, so it is not safe to allow offset folding
// which can create arbitrary 64-bit addends. (This is only a problem for
// R_AMDGPU_*32_HI relocations since other relocation types are unaffected by
// the high 32 bits of the addend.)
//
// This should be kept in sync with how HasRelocationAddend is initialized in
// the constructor of ELFAMDGPUAsmBackend.
if (!Subtarget->isAmdHsaOS())
return false;

// We can fold offsets for anything that doesn't require a GOT relocation.
return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
Expand Down
20 changes: 10 additions & 10 deletions llvm/test/CodeGen/AMDGPU/global-constant.ll
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,14 @@
; GCN-DEFAULT: s_addc_u32 s{{[0-9]+}}, s[[PC1_HI]], private2@rel32@hi+12

; MESA uses absolute relocations.
; GCN-MESA: s_add_u32 s2, s4, private1@abs32@lo
; GCN-MESA: s_addc_u32 s3, s5, private1@abs32@hi
; GCN-MESA: s_add_u32 s2, private1@abs32@lo, s4
; GCN-MESA: s_addc_u32 s3, private1@abs32@hi, s5

; PAL uses absolute relocations.
; GCN-PAL: s_add_u32 s2, s4, private1@abs32@lo
; GCN-PAL: s_addc_u32 s3, s5, private1@abs32@hi
; GCN-PAL: s_add_u32 s4, s4, private2@abs32@lo
; GCN-PAL: s_addc_u32 s5, s5, private2@abs32@hi
; GCN-PAL: s_add_u32 s2, private1@abs32@lo, s4
; GCN-PAL: s_addc_u32 s3, private1@abs32@hi, s5
; GCN-PAL: s_add_u32 s4, private2@abs32@lo, s4
; GCN-PAL: s_addc_u32 s5, private2@abs32@hi, s5

; R600-LABEL: private_test
define amdgpu_kernel void @private_test(i32 %index, ptr addrspace(1) %out) {
Expand All @@ -44,13 +44,13 @@ define amdgpu_kernel void @private_test(i32 %index, ptr addrspace(1) %out) {
; GCN-DEFAULT: s_add_u32 s{{[0-9]+}}, s[[PC0_LO]], available_externally@gotpcrel32@lo+4
; GCN-DEFAULT: s_addc_u32 s{{[0-9]+}}, s[[PC0_HI]], available_externally@gotpcrel32@hi+12

; GCN-MESA: s_mov_b32 s1, available_externally@abs32@hi+4
; GCN-MESA: s_mov_b32 s0, available_externally@abs32@lo+4
; GCN-MESA: s_mov_b32 s1, available_externally@abs32@hi
; GCN-MESA: s_mov_b32 s0, available_externally@abs32@lo

; R600-LABEL: available_externally_test

; GCN-PAL: s_mov_b32 s3, available_externally@abs32@hi+4
; GCN-PAL: s_mov_b32 s2, available_externally@abs32@lo+4
; GCN-PAL: s_mov_b32 s3, available_externally@abs32@hi
; GCN-PAL: s_mov_b32 s2, available_externally@abs32@lo
define amdgpu_kernel void @available_externally_test(ptr addrspace(1) %out) {
%ptr = getelementptr [256 x i32], ptr addrspace(4) @available_externally, i32 0, i32 1
%val = load i32, ptr addrspace(4) %ptr
Expand Down
5 changes: 2 additions & 3 deletions llvm/test/CodeGen/AMDGPU/llvm.memcpy.ll
Original file line number Diff line number Diff line change
Expand Up @@ -315,10 +315,9 @@ define amdgpu_kernel void @test_small_memcpy_i64_global_to_global_align16(ptr ad

; FUNC-LABEL: {{^}}test_memcpy_const_string_align4:
; SI: s_getpc_b64
; SI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, hello.align4@rel32@lo+20
; SI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, hello.align4@rel32@lo+4
; SI: s_addc_u32
; SI-DAG: s_load_dwordx4
; SI-DAG: s_load_dwordx4
; SI-DAG: s_load_dwordx8
; SI-DAG: s_load_dwordx2
; SI-DAG: buffer_store_dwordx4
; SI-DAG: buffer_store_dwordx4
Expand Down
19 changes: 16 additions & 3 deletions llvm/test/CodeGen/AMDGPU/rel32.ll
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,22 @@

; CHECK-LABEL: rel32_neg_offset:
; CHECK: s_getpc_b64 s[[[LO:[0-9]+]]:[[HI:[0-9]+]]]
; CHECK-NEXT: s_add_u32 s[[LO]], s[[LO]], g@rel32@lo-4
; CHECK-NEXT: s_addc_u32 s[[HI]], s[[HI]], g@rel32@hi+4
; CHECK-NEXT: s_add_u32 s[[LO]], s[[LO]], g@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s[[HI]], s[[HI]], g@rel32@hi+12
; CHECK-NEXT: s_add_u32 s[[LO]], s[[LO]], -16
; CHECK-NEXT: s_addc_u32 s[[HI]], s[[HI]], -1
define ptr addrspace(4) @rel32_neg_offset() {
%r = getelementptr i32, ptr addrspace(4) @g, i64 -2
%r = getelementptr i32, ptr addrspace(4) @g, i64 -4
ret ptr addrspace(4) %r
}

; CHECK-LABEL: rel32_large_offset:
; CHECK: s_getpc_b64 s[[[LO:[0-9]+]]:[[HI:[0-9]+]]]
; CHECK-NEXT: s_add_u32 s[[LO]], s[[LO]], g@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s[[HI]], s[[HI]], g@rel32@hi+12
; CHECK-NEXT: s_add_u32 s[[LO]], s[[LO]], 0x502f9000
; CHECK-NEXT: s_addc_u32 s[[HI]], s[[HI]], 9
define ptr addrspace(4) @rel32_large_offset() {
%r = getelementptr i32, ptr addrspace(4) @g, i64 10000000000
ret ptr addrspace(4) %r
}

0 comments on commit c2c650f

Please sign in to comment.