Skip to content

Commit

Permalink
AMDGPU: Assume all amdhsa kernarg passed implicit arguments by default
Browse files Browse the repository at this point in the history
Previously we would require adding an attribute to kernels to enable
the inputs passed in the kernarg segment, accessed by
llvm.amdgcn.implicitarg.ptr. This violates the principle of being
correct by default. Some OpenMP testcases were broken recently since
it wasn't correctly setting this attribute, and no known frontends are
setting this to anything other than the maximum.

Most of the test changes are from load widening of argument loads
since there now more implied dereferenceable bytes.
  • Loading branch information
arsenm committed Dec 4, 2021
1 parent ae0ba7d commit 2959e08
Show file tree
Hide file tree
Showing 12 changed files with 1,798 additions and 1,745 deletions.
6 changes: 5 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
Expand Up @@ -648,14 +648,18 @@ bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
}

unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
assert(AMDGPU::isKernel(F.getCallingConv()));

// We don't allocate the segment if we know the implicit arguments weren't
// used, even if the ABI implies we need them.
if (F.hasFnAttribute("amdgpu-no-implicitarg-ptr"))
return 0;

if (isMesaKernel(F))
return 16;
return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0);

// Assume all implicit inputs are used by default
return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 56);
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
Expand Down
Expand Up @@ -9,34 +9,35 @@ declare hidden void @extern()
define amdgpu_kernel void @kernel_call_no_workitem_ids() {
; CHECK-LABEL: name: kernel_call_no_workitem_ids
; CHECK: bb.1 (%ir-block.0):
; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9
; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr14
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr13
; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr12
; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr16
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr15
; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr14
; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11
; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @extern
; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(p4) = COPY [[COPY5]]
; CHECK-NEXT: [[COPY7:%[0-9]+]]:_(p4) = COPY [[COPY4]]
; CHECK-NEXT: [[C:%[0-9]+]]:_(p4) = G_CONSTANT i64 0
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[C]], [[C1]](s64)
; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(s64) = COPY [[COPY3]]
; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[COPY2]]
; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[COPY1]]
; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY]]
; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY12]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY6]](p4)
; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY7]](p4)
; CHECK-NEXT: [[COPY7:%[0-9]+]]:_(p4) = COPY [[COPY5]]
; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY [[COPY4]]
; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY6]](p4)
; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY9]], [[C]](s64)
; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(s64) = COPY [[COPY3]]
; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY2]]
; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY1]]
; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY]]
; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY14]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY7]](p4)
; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY8]](p4)
; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4)
; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY8]](s64)
; CHECK-NEXT: $sgpr12 = COPY [[COPY9]](s32)
; CHECK-NEXT: $sgpr13 = COPY [[COPY10]](s32)
; CHECK-NEXT: $sgpr14 = COPY [[COPY11]](s32)
; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY10]](s64)
; CHECK-NEXT: $sgpr12 = COPY [[COPY11]](s32)
; CHECK-NEXT: $sgpr13 = COPY [[COPY12]](s32)
; CHECK-NEXT: $sgpr14 = COPY [[COPY13]](s32)
; CHECK-NEXT: $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @extern, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; CHECK-NEXT: S_ENDPGM 0
Expand All @@ -47,37 +48,38 @@ define amdgpu_kernel void @kernel_call_no_workitem_ids() {
define amdgpu_kernel void @kernel_call_no_workgroup_ids() {
; CHECK-LABEL: name: kernel_call_no_workgroup_ids
; CHECK: bb.1 (%ir-block.0):
; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9
; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1
; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11
; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @extern
; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(p4) = COPY [[COPY5]]
; CHECK-NEXT: [[COPY7:%[0-9]+]]:_(p4) = COPY [[COPY4]]
; CHECK-NEXT: [[C:%[0-9]+]]:_(p4) = G_CONSTANT i64 0
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[C]], [[C1]](s64)
; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(s64) = COPY [[COPY3]]
; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32)
; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 10
; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY10]], [[C2]](s32)
; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY9]], [[SHL]]
; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY11]], [[C3]](s32)
; CHECK-NEXT: [[COPY7:%[0-9]+]]:_(p4) = COPY [[COPY5]]
; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY [[COPY4]]
; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY6]](p4)
; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY9]], [[C]](s64)
; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(s64) = COPY [[COPY3]]
; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32)
; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10
; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY12]], [[C1]](s32)
; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY11]], [[SHL]]
; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY13]], [[C2]](s32)
; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY12]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY6]](p4)
; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY7]](p4)
; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY14]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY7]](p4)
; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY8]](p4)
; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4)
; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY8]](s64)
; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY10]](s64)
; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32)
; CHECK-NEXT: $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @extern, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $vgpr31
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
Expand All @@ -89,27 +91,28 @@ define amdgpu_kernel void @kernel_call_no_workgroup_ids() {
define amdgpu_kernel void @kernel_call_no_other_sgprs() {
; CHECK-LABEL: name: kernel_call_no_other_sgprs
; CHECK: bb.1 (%ir-block.0):
; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr8_sgpr9
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1
; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @extern
; CHECK-NEXT: [[C:%[0-9]+]]:_(p4) = G_CONSTANT i64 0
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[C]], [[C1]](s64)
; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32)
; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 10
; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY4]], [[C2]](s32)
; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY3]], [[SHL]]
; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY5]], [[C3]](s32)
; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(p4) = COPY [[COPY3]](p4)
; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY4]], [[C]](s64)
; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32)
; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10
; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY6]], [[C1]](s32)
; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY5]], [[SHL]]
; CHECK-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY7]], [[C2]](s32)
; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY6]](<4 x s32>)
; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY8]](<4 x s32>)
; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4)
; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32)
; CHECK-NEXT: $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @extern, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr8_sgpr9, implicit $vgpr31
Expand Down

0 comments on commit 2959e08

Please sign in to comment.