Original file line number Diff line number Diff line change
Expand Up @@ -9,34 +9,35 @@ declare hidden void @extern()
define amdgpu_kernel void @kernel_call_no_workitem_ids() {
; CHECK-LABEL: name: kernel_call_no_workitem_ids
; CHECK: bb.1 (%ir-block.0):
; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9
; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr14
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr13
; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr12
; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr16
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr15
; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr14
; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11
; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @extern
; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(p4) = COPY [[COPY5]]
; CHECK-NEXT: [[COPY7:%[0-9]+]]:_(p4) = COPY [[COPY4]]
; CHECK-NEXT: [[C:%[0-9]+]]:_(p4) = G_CONSTANT i64 0
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[C]], [[C1]](s64)
; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(s64) = COPY [[COPY3]]
; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[COPY2]]
; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[COPY1]]
; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY]]
; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY12]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY6]](p4)
; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY7]](p4)
; CHECK-NEXT: [[COPY7:%[0-9]+]]:_(p4) = COPY [[COPY5]]
; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY [[COPY4]]
; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY6]](p4)
; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY9]], [[C]](s64)
; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(s64) = COPY [[COPY3]]
; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY2]]
; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY1]]
; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY]]
; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY14]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY7]](p4)
; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY8]](p4)
; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4)
; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY8]](s64)
; CHECK-NEXT: $sgpr12 = COPY [[COPY9]](s32)
; CHECK-NEXT: $sgpr13 = COPY [[COPY10]](s32)
; CHECK-NEXT: $sgpr14 = COPY [[COPY11]](s32)
; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY10]](s64)
; CHECK-NEXT: $sgpr12 = COPY [[COPY11]](s32)
; CHECK-NEXT: $sgpr13 = COPY [[COPY12]](s32)
; CHECK-NEXT: $sgpr14 = COPY [[COPY13]](s32)
; CHECK-NEXT: $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @extern, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; CHECK-NEXT: S_ENDPGM 0
Expand All @@ -47,37 +48,38 @@ define amdgpu_kernel void @kernel_call_no_workitem_ids() {
define amdgpu_kernel void @kernel_call_no_workgroup_ids() {
; CHECK-LABEL: name: kernel_call_no_workgroup_ids
; CHECK: bb.1 (%ir-block.0):
; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9
; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1
; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11
; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @extern
; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(p4) = COPY [[COPY5]]
; CHECK-NEXT: [[COPY7:%[0-9]+]]:_(p4) = COPY [[COPY4]]
; CHECK-NEXT: [[C:%[0-9]+]]:_(p4) = G_CONSTANT i64 0
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[C]], [[C1]](s64)
; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(s64) = COPY [[COPY3]]
; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32)
; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 10
; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY10]], [[C2]](s32)
; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY9]], [[SHL]]
; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY11]], [[C3]](s32)
; CHECK-NEXT: [[COPY7:%[0-9]+]]:_(p4) = COPY [[COPY5]]
; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(p4) = COPY [[COPY4]]
; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY6]](p4)
; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY9]], [[C]](s64)
; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(s64) = COPY [[COPY3]]
; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32)
; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10
; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY12]], [[C1]](s32)
; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY11]], [[SHL]]
; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY13]], [[C2]](s32)
; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY12]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY6]](p4)
; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY7]](p4)
; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY14]](<4 x s32>)
; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY7]](p4)
; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY8]](p4)
; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4)
; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY8]](s64)
; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY10]](s64)
; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32)
; CHECK-NEXT: $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @extern, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $vgpr31
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
Expand All @@ -89,27 +91,28 @@ define amdgpu_kernel void @kernel_call_no_workgroup_ids() {
define amdgpu_kernel void @kernel_call_no_other_sgprs() {
; CHECK-LABEL: name: kernel_call_no_other_sgprs
; CHECK: bb.1 (%ir-block.0):
; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr8_sgpr9
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1
; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @extern
; CHECK-NEXT: [[C:%[0-9]+]]:_(p4) = G_CONSTANT i64 0
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[C]], [[C1]](s64)
; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32)
; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 10
; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY4]], [[C2]](s32)
; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY3]], [[SHL]]
; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY5]], [[C3]](s32)
; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(p4) = COPY [[COPY3]](p4)
; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY4]], [[C]](s64)
; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32)
; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10
; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY6]], [[C1]](s32)
; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY5]], [[SHL]]
; CHECK-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY7]], [[C2]](s32)
; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]]
; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY6]](<4 x s32>)
; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY8]](<4 x s32>)
; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4)
; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32)
; CHECK-NEXT: $sgpr30_sgpr31 = G_SI_CALL [[GV]](p0), @extern, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr8_sgpr9, implicit $vgpr31
Expand Down
3,011 changes: 1,527 additions & 1,484 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -74,14 +74,9 @@ define amdgpu_kernel void @opencl_test_implicit_alignment(i32 addrspace(1)* %out
ret void
}

; Mesa implies that 16 bytes are always allocated; HSA requires the
; attribute for the additional space.
; ALL-LABEL: {{^}}test_no_kernargs:
; HSA: enable_sgpr_kernarg_segment_ptr = 0
; HSA: kernarg_segment_byte_size = 0

; OS-MESA3D: enable_sgpr_kernarg_segment_ptr = 1
; OS-MESA3D: kernarg_segment_byte_size = 16
; CO-V2: enable_sgpr_kernarg_segment_ptr = 0
; CO-V2: kernarg_segment_byte_size = 0
; CO-V2: kernarg_segment_alignment = 4

; HSA: s_mov_b64 [[OFFSET_NULL:s\[[0-9]+:[0-9]+\]]], 40{{$}}
Expand All @@ -97,7 +92,7 @@ define amdgpu_kernel void @test_no_kernargs() #1 {

; ALL-LABEL: {{^}}opencl_test_implicit_alignment_no_explicit_kernargs:
; HSA: kernarg_segment_byte_size = 48
; OS-MESA3d: kernarg_segment_byte_size = 16
; OS-MESA3D: kernarg_segment_byte_size = 16
; CO-V2: kernarg_segment_alignment = 4
define amdgpu_kernel void @opencl_test_implicit_alignment_no_explicit_kernargs() #2 {
%implicitarg.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
Expand Down Expand Up @@ -131,6 +126,6 @@ declare i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #0
declare i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #0

attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }
attributes #1 = { nounwind "amdgpu-implicitarg-num-bytes"="0" }
attributes #2 = { nounwind "amdgpu-implicitarg-num-bytes"="48" }
attributes #3 = { nounwind "amdgpu-implicitarg-num-bytes"="38" }
Original file line number Diff line number Diff line change
Expand Up @@ -283,7 +283,7 @@ define amdgpu_kernel void @kern_indirect_use_every_sgpr_input(i8) #1 {
; GCN: .amdhsa_system_sgpr_workgroup_id_z 1
; GCN: .amdhsa_system_sgpr_workgroup_info 0
; GCN: .amdhsa_system_vgpr_workitem_id 2
; Kernel declared with an empty parameter list; its body only calls
; @use_every_sgpr_input and then returns.
define amdgpu_kernel void @kern_indirect_use_every_sgpr_input_no_kernargs() #1 {
define amdgpu_kernel void @kern_indirect_use_every_sgpr_input_no_kernargs() #2 {
call void @use_every_sgpr_input()
ret void
}
Expand Down Expand Up @@ -361,3 +361,4 @@ declare noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0

attributes #0 = { nounwind readnone speculatable }
attributes #1 = { nounwind noinline }
attributes #2 = { nounwind noinline "amdgpu-implicitarg-num-bytes"="0" }
3 changes: 2 additions & 1 deletion llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll
Original file line number Diff line number Diff line change
Expand Up @@ -478,7 +478,7 @@ define amdgpu_kernel void @kern_indirect_use_every_sgpr_input(i8) #1 {
; GCN: s_mov_b64 s[8:9], 0{{$}}
; GCN: s_mov_b32 s32, 0
; GCN: s_swappc_b64
; Kernel declared with an empty parameter list; its body only calls
; @use_every_sgpr_input and then returns.
define amdgpu_kernel void @kern_indirect_use_every_sgpr_input_no_kernargs() #1 {
define amdgpu_kernel void @kern_indirect_use_every_sgpr_input_no_kernargs() #2 {
call void @use_every_sgpr_input()
ret void
}
Expand Down Expand Up @@ -613,3 +613,4 @@ declare noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0

attributes #0 = { nounwind readnone speculatable }
attributes #1 = { nounwind noinline }
attributes #2 = { nounwind noinline "amdgpu-implicitarg-num-bytes"="0" }
96 changes: 48 additions & 48 deletions llvm/test/CodeGen/AMDGPU/cc-update.ll
Original file line number Diff line number Diff line change
Expand Up @@ -55,12 +55,12 @@ entry:
define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 {
; GFX803-LABEL: test_kern_call:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_add_i32 s10, s10, s15
; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8
; GFX803-NEXT: s_add_u32 s0, s0, s15
; GFX803-NEXT: s_add_i32 s12, s12, s17
; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX803-NEXT: s_add_u32 s0, s0, s17
; GFX803-NEXT: s_addc_u32 s1, s1, 0
; GFX803-NEXT: s_mov_b32 s32, 0
; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11
; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX803-NEXT: s_getpc_b64 s[4:5]
; GFX803-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4
; GFX803-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12
Expand All @@ -69,9 +69,9 @@ define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 {
;
; GFX900-LABEL: test_kern_call:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_add_u32 flat_scratch_lo, s10, s15
; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
; GFX900-NEXT: s_add_u32 s0, s0, s15
; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX900-NEXT: s_add_u32 s0, s0, s17
; GFX900-NEXT: s_addc_u32 s1, s1, 0
; GFX900-NEXT: s_mov_b32 s32, 0
; GFX900-NEXT: s_getpc_b64 s[4:5]
Expand All @@ -82,12 +82,12 @@ define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 {
;
; GFX1010-LABEL: test_kern_call:
; GFX1010: ; %bb.0: ; %entry
; GFX1010-NEXT: s_add_u32 s10, s10, s15
; GFX1010-NEXT: s_add_u32 s12, s12, s17
; GFX1010-NEXT: s_mov_b32 s32, 0
; GFX1010-NEXT: s_addc_u32 s11, s11, 0
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
; GFX1010-NEXT: s_add_u32 s0, s0, s15
; GFX1010-NEXT: s_addc_u32 s13, s13, 0
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX1010-NEXT: s_add_u32 s0, s0, s17
; GFX1010-NEXT: s_addc_u32 s1, s1, 0
; GFX1010-NEXT: s_getpc_b64 s[4:5]
; GFX1010-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4
Expand All @@ -102,13 +102,13 @@ entry:
define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
; GFX803-LABEL: test_kern_stack_and_call:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_add_i32 s10, s10, s15
; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8
; GFX803-NEXT: s_add_u32 s0, s0, s15
; GFX803-NEXT: s_add_i32 s12, s12, s17
; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX803-NEXT: s_add_u32 s0, s0, s17
; GFX803-NEXT: s_addc_u32 s1, s1, 0
; GFX803-NEXT: v_mov_b32_e32 v0, 0
; GFX803-NEXT: s_movk_i32 s32, 0x400
; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11
; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_getpc_b64 s[4:5]
Expand All @@ -119,9 +119,9 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
;
; GFX900-LABEL: test_kern_stack_and_call:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_add_u32 flat_scratch_lo, s10, s15
; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
; GFX900-NEXT: s_add_u32 s0, s0, s15
; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX900-NEXT: s_add_u32 s0, s0, s17
; GFX900-NEXT: s_addc_u32 s1, s1, 0
; GFX900-NEXT: v_mov_b32_e32 v0, 0
; GFX900-NEXT: s_movk_i32 s32, 0x400
Expand All @@ -135,13 +135,13 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
;
; GFX1010-LABEL: test_kern_stack_and_call:
; GFX1010: ; %bb.0: ; %entry
; GFX1010-NEXT: s_add_u32 s10, s10, s15
; GFX1010-NEXT: s_add_u32 s12, s12, s17
; GFX1010-NEXT: s_movk_i32 s32, 0x200
; GFX1010-NEXT: s_addc_u32 s11, s11, 0
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
; GFX1010-NEXT: s_addc_u32 s13, s13, 0
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX1010-NEXT: v_mov_b32_e32 v0, 0
; GFX1010-NEXT: s_add_u32 s0, s0, s15
; GFX1010-NEXT: s_add_u32 s0, s0, s17
; GFX1010-NEXT: s_addc_u32 s1, s1, 0
; GFX1010-NEXT: s_getpc_b64 s[4:5]
; GFX1010-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4
Expand Down Expand Up @@ -215,13 +215,13 @@ entry:
define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 {
; GFX803-LABEL: test_force_fp_kern_call:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_add_i32 s10, s10, s15
; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8
; GFX803-NEXT: s_add_u32 s0, s0, s15
; GFX803-NEXT: s_add_i32 s12, s12, s17
; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX803-NEXT: s_add_u32 s0, s0, s17
; GFX803-NEXT: s_addc_u32 s1, s1, 0
; GFX803-NEXT: s_mov_b32 s32, 0
; GFX803-NEXT: s_mov_b32 s33, 0
; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11
; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX803-NEXT: s_getpc_b64 s[4:5]
; GFX803-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4
; GFX803-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12
Expand All @@ -230,9 +230,9 @@ define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 {
;
; GFX900-LABEL: test_force_fp_kern_call:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_add_u32 flat_scratch_lo, s10, s15
; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
; GFX900-NEXT: s_add_u32 s0, s0, s15
; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX900-NEXT: s_add_u32 s0, s0, s17
; GFX900-NEXT: s_addc_u32 s1, s1, 0
; GFX900-NEXT: s_mov_b32 s32, 0
; GFX900-NEXT: s_mov_b32 s33, 0
Expand All @@ -244,13 +244,13 @@ define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 {
;
; GFX1010-LABEL: test_force_fp_kern_call:
; GFX1010: ; %bb.0: ; %entry
; GFX1010-NEXT: s_add_u32 s10, s10, s15
; GFX1010-NEXT: s_add_u32 s12, s12, s17
; GFX1010-NEXT: s_mov_b32 s32, 0
; GFX1010-NEXT: s_mov_b32 s33, 0
; GFX1010-NEXT: s_addc_u32 s11, s11, 0
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
; GFX1010-NEXT: s_add_u32 s0, s0, s15
; GFX1010-NEXT: s_addc_u32 s13, s13, 0
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX1010-NEXT: s_add_u32 s0, s0, s17
; GFX1010-NEXT: s_addc_u32 s1, s1, 0
; GFX1010-NEXT: s_getpc_b64 s[4:5]
; GFX1010-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4
Expand All @@ -265,14 +265,14 @@ entry:
define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_addr #2 {
; GFX803-LABEL: test_force_fp_kern_stack_and_call:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_add_i32 s10, s10, s15
; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8
; GFX803-NEXT: s_add_u32 s0, s0, s15
; GFX803-NEXT: s_add_i32 s12, s12, s17
; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX803-NEXT: s_add_u32 s0, s0, s17
; GFX803-NEXT: s_mov_b32 s33, 0
; GFX803-NEXT: s_addc_u32 s1, s1, 0
; GFX803-NEXT: v_mov_b32_e32 v0, 0
; GFX803-NEXT: s_movk_i32 s32, 0x400
; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11
; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_getpc_b64 s[4:5]
Expand All @@ -283,9 +283,9 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add
;
; GFX900-LABEL: test_force_fp_kern_stack_and_call:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_add_u32 flat_scratch_lo, s10, s15
; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
; GFX900-NEXT: s_add_u32 s0, s0, s15
; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX900-NEXT: s_add_u32 s0, s0, s17
; GFX900-NEXT: s_mov_b32 s33, 0
; GFX900-NEXT: s_addc_u32 s1, s1, 0
; GFX900-NEXT: v_mov_b32_e32 v0, 0
Expand All @@ -300,14 +300,14 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add
;
; GFX1010-LABEL: test_force_fp_kern_stack_and_call:
; GFX1010: ; %bb.0: ; %entry
; GFX1010-NEXT: s_add_u32 s10, s10, s15
; GFX1010-NEXT: s_add_u32 s12, s12, s17
; GFX1010-NEXT: s_movk_i32 s32, 0x200
; GFX1010-NEXT: s_mov_b32 s33, 0
; GFX1010-NEXT: s_addc_u32 s11, s11, 0
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
; GFX1010-NEXT: s_addc_u32 s13, s13, 0
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX1010-NEXT: v_mov_b32_e32 v0, 0
; GFX1010-NEXT: s_add_u32 s0, s0, s15
; GFX1010-NEXT: s_add_u32 s0, s0, s17
; GFX1010-NEXT: s_addc_u32 s1, s1, 0
; GFX1010-NEXT: s_getpc_b64 s[4:5]
; GFX1010-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4
Expand Down
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll
Original file line number Diff line number Diff line change
Expand Up @@ -68,11 +68,11 @@ define amdgpu_kernel void @stack_object_in_kernel_no_calls() {
define amdgpu_kernel void @kernel_calls_no_stack() {
; FLAT_SCR_OPT-LABEL: kernel_calls_no_stack:
; FLAT_SCR_OPT: ; %bb.0:
; FLAT_SCR_OPT-NEXT: s_add_u32 s6, s6, s11
; FLAT_SCR_OPT-NEXT: s_add_u32 s8, s8, s13
; FLAT_SCR_OPT-NEXT: s_mov_b32 s32, 0
; FLAT_SCR_OPT-NEXT: s_addc_u32 s7, s7, 0
; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
; FLAT_SCR_OPT-NEXT: s_addc_u32 s9, s9, 0
; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8
; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
; FLAT_SCR_OPT-NEXT: s_getpc_b64 s[0:1]
; FLAT_SCR_OPT-NEXT: s_add_u32 s0, s0, extern_func@gotpcrel32@lo+4
; FLAT_SCR_OPT-NEXT: s_addc_u32 s1, s1, extern_func@gotpcrel32@hi+12
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AMDGPU/hsa-metadata-enqueue-kernel-v3.ll
Original file line number Diff line number Diff line change
Expand Up @@ -76,8 +76,8 @@ define amdgpu_kernel void @test_enqueue_kernel_caller(i8 %a) #1
; CHECK-NEXT: - 0
; CHECK-NOT: amdhsa.printf:

attributes #0 = { "amdgpu-implicitarg-num-bytes"="48" }
attributes #1 = { "calls-enqueue-kernel" "amdgpu-implicitarg-num-bytes"="48" }
attributes #0 = { optnone noinline "amdgpu-implicitarg-num-bytes"="48" }
attributes #1 = { optnone noinline "calls-enqueue-kernel" "amdgpu-implicitarg-num-bytes"="48" }

!1 = !{i32 0}
!2 = !{!"none"}
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AMDGPU/hsa-metadata-enqueue-kernel.ll
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,8 @@ define amdgpu_kernel void @test_enqueue_kernel_caller(i8 %a) #1
ret void
}

attributes #0 = { "amdgpu-implicitarg-num-bytes"="48" }
attributes #1 = { "calls-enqueue-kernel" "amdgpu-implicitarg-num-bytes"="48" }
attributes #0 = { optnone noinline "amdgpu-implicitarg-num-bytes"="48" }
attributes #1 = { optnone noinline "calls-enqueue-kernel" "amdgpu-implicitarg-num-bytes"="48" }

!1 = !{i32 0}
!2 = !{!"none"}
Expand Down
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full-v3.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1894,9 +1894,9 @@ define amdgpu_kernel void @unknown_addrspace_kernarg(i32 addrspace(12345)* %ptr)
; CHECK-NEXT: - 1
; CHECK-NEXT: - 0

attributes #0 = { "amdgpu-implicitarg-num-bytes"="56" }
attributes #1 = { "amdgpu-implicitarg-num-bytes"="56" "runtime-handle"="__test_block_invoke_kernel_runtime_handle" }
attributes #2 = { "amdgpu-implicitarg-num-bytes"="56" "calls-enqueue-kernel" }
attributes #0 = { optnone noinline "amdgpu-implicitarg-num-bytes"="56" }
attributes #1 = { optnone noinline "amdgpu-implicitarg-num-bytes"="56" "runtime-handle"="__test_block_invoke_kernel_runtime_handle" }
attributes #2 = { optnone noinline "amdgpu-implicitarg-num-bytes"="56" "calls-enqueue-kernel" }

!llvm.printf.fmts = !{!100, !101}

Expand Down
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1866,9 +1866,9 @@ define amdgpu_kernel void @unknown_addrspace_kernarg(i32 addrspace(12345)* %ptr)
ret void
}

attributes #0 = { "amdgpu-implicitarg-num-bytes"="56" }
attributes #1 = { "amdgpu-implicitarg-num-bytes"="56" "runtime-handle"="__test_block_invoke_kernel_runtime_handle" }
attributes #2 = { "amdgpu-implicitarg-num-bytes"="56" "calls-enqueue-kernel" }
attributes #0 = { optnone noinline "amdgpu-implicitarg-num-bytes"="56" }
attributes #1 = { optnone noinline "amdgpu-implicitarg-num-bytes"="56" "runtime-handle"="__test_block_invoke_kernel_runtime_handle" }
attributes #2 = { optnone noinline "amdgpu-implicitarg-num-bytes"="56" "calls-enqueue-kernel" }

!llvm.printf.fmts = !{!100, !101}

Expand Down
14 changes: 8 additions & 6 deletions llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v3.ll
Original file line number Diff line number Diff line change
Expand Up @@ -296,9 +296,11 @@ entry:
; CHECK-NEXT: - 1
; CHECK-NEXT: - 0

attributes #0 = { "amdgpu-implicitarg-num-bytes"="8" }
attributes #1 = { "amdgpu-implicitarg-num-bytes"="16" }
attributes #2 = { "amdgpu-implicitarg-num-bytes"="24" }
attributes #3 = { "amdgpu-implicitarg-num-bytes"="32" }
attributes #4 = { "amdgpu-implicitarg-num-bytes"="48" }
attributes #5 = { "amdgpu-implicitarg-num-bytes"="56" }
; Nothing in this test uses llvm.amdgcn.implicitarg.ptr, so the kernels are
; marked optnone to keep the implicit argument allocation from being
; optimized out.
attributes #0 = { optnone noinline "amdgpu-implicitarg-num-bytes"="8" }
attributes #1 = { optnone noinline "amdgpu-implicitarg-num-bytes"="16" }
attributes #2 = { optnone noinline "amdgpu-implicitarg-num-bytes"="24" }
attributes #3 = { optnone noinline "amdgpu-implicitarg-num-bytes"="32" }
attributes #4 = { optnone noinline "amdgpu-implicitarg-num-bytes"="48" }
attributes #5 = { optnone noinline "amdgpu-implicitarg-num-bytes"="56" }
14 changes: 8 additions & 6 deletions llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args.ll
Original file line number Diff line number Diff line change
Expand Up @@ -300,9 +300,11 @@ entry:
ret void
}

attributes #0 = { "amdgpu-implicitarg-num-bytes"="8" }
attributes #1 = { "amdgpu-implicitarg-num-bytes"="16" }
attributes #2 = { "amdgpu-implicitarg-num-bytes"="24" }
attributes #3 = { "amdgpu-implicitarg-num-bytes"="32" }
attributes #4 = { "amdgpu-implicitarg-num-bytes"="48" }
attributes #5 = { "amdgpu-implicitarg-num-bytes"="56" }
; Nothing in this test uses llvm.amdgcn.implicitarg.ptr, so the kernels are
; marked optnone to keep the implicit argument allocation from being
; optimized out.
attributes #0 = { optnone noinline "amdgpu-implicitarg-num-bytes"="8" }
attributes #1 = { optnone noinline "amdgpu-implicitarg-num-bytes"="16" }
attributes #2 = { optnone noinline "amdgpu-implicitarg-num-bytes"="24" }
attributes #3 = { optnone noinline "amdgpu-implicitarg-num-bytes"="32" }
attributes #4 = { optnone noinline "amdgpu-implicitarg-num-bytes"="48" }
attributes #5 = { optnone noinline "amdgpu-implicitarg-num-bytes"="56" }
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ define amdgpu_kernel void @test_kernel(i8 %a) #0
; CHECK-NEXT: - 1
; CHECK-NEXT: - 0

attributes #0 = { "amdgpu-implicitarg-num-bytes"="48" }
attributes #0 = { optnone noinline "amdgpu-implicitarg-num-bytes"="48" }

!1 = !{i32 0}
!2 = !{!"none"}
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-absent.ll
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ define amdgpu_kernel void @test_kernel(i8 %a) #0
ret void
}

attributes #0 = { "amdgpu-implicitarg-num-bytes"="48" }
attributes #0 = { optnone noinline "amdgpu-implicitarg-num-bytes"="48" }

!1 = !{i32 0}
!2 = !{!"none"}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ define amdgpu_kernel void @test_kernel(i8 %a) #0
; CHECK-NEXT: - 1
; CHECK-NEXT: - 0

attributes #0 = { "amdgpu-implicitarg-num-bytes"="48" }
attributes #0 = { optnone noinline "amdgpu-implicitarg-num-bytes"="48" }

!1 = !{i32 0}
!2 = !{!"none"}
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-present.ll
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ define amdgpu_kernel void @test_kernel(i8 %a) #0
ret void
}

attributes #0 = { "amdgpu-implicitarg-num-bytes"="48" }
attributes #0 = { optnone noinline "amdgpu-implicitarg-num-bytes"="48" }

!1 = !{i32 0}
!2 = !{!"none"}
Expand Down
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/AMDGPU/indirect-call.ll
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr(i8) {
; GCN-NEXT: workitem_private_segment_byte_size = 16384
; GCN-NEXT: workgroup_group_segment_byte_size = 0
; GCN-NEXT: gds_segment_byte_size = 0
; GCN-NEXT: kernarg_segment_byte_size = 4
; GCN-NEXT: kernarg_segment_byte_size = 64
; GCN-NEXT: workgroup_fbarrier_count = 0
; GCN-NEXT: wavefront_sgpr_count = 37
; GCN-NEXT: workitem_vgpr_count = 32
Expand Down Expand Up @@ -151,7 +151,7 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr(i8) {
; GISEL-NEXT: workitem_private_segment_byte_size = 16384
; GISEL-NEXT: workgroup_group_segment_byte_size = 0
; GISEL-NEXT: gds_segment_byte_size = 0
; GISEL-NEXT: kernarg_segment_byte_size = 4
; GISEL-NEXT: kernarg_segment_byte_size = 64
; GISEL-NEXT: workgroup_fbarrier_count = 0
; GISEL-NEXT: wavefront_sgpr_count = 37
; GISEL-NEXT: workitem_vgpr_count = 32
Expand Down Expand Up @@ -249,7 +249,7 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr_arg(i8) {
; GCN-NEXT: workitem_private_segment_byte_size = 16384
; GCN-NEXT: workgroup_group_segment_byte_size = 0
; GCN-NEXT: gds_segment_byte_size = 0
; GCN-NEXT: kernarg_segment_byte_size = 4
; GCN-NEXT: kernarg_segment_byte_size = 64
; GCN-NEXT: workgroup_fbarrier_count = 0
; GCN-NEXT: wavefront_sgpr_count = 37
; GCN-NEXT: workitem_vgpr_count = 32
Expand Down Expand Up @@ -343,7 +343,7 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr_arg(i8) {
; GISEL-NEXT: workitem_private_segment_byte_size = 16384
; GISEL-NEXT: workgroup_group_segment_byte_size = 0
; GISEL-NEXT: gds_segment_byte_size = 0
; GISEL-NEXT: kernarg_segment_byte_size = 4
; GISEL-NEXT: kernarg_segment_byte_size = 64
; GISEL-NEXT: workgroup_fbarrier_count = 0
; GISEL-NEXT: wavefront_sgpr_count = 37
; GISEL-NEXT: workitem_vgpr_count = 32
Expand Down
42 changes: 21 additions & 21 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,15 @@
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MESA %s

; GCN-LABEL: {{^}}kernel_implicitarg_ptr_empty:
; HSA: enable_sgpr_kernarg_segment_ptr = 0
; HSA: kernarg_segment_byte_size = 0
; HSA: enable_sgpr_kernarg_segment_ptr = 1
; HSA: kernarg_segment_byte_size = 56
; HSA: kernarg_segment_alignment = 4

; MESA: enable_sgpr_kernarg_segment_ptr = 1
; MESA: kernarg_segment_byte_size = 16
; MESA: kernarg_segment_alignment = 4

; HSA: s_mov_b64 [[NULL:s\[[0-9]+:[0-9]+\]]], 0{{$}}
; HSA: s_load_dword s0, [[NULL]], 0x0
; HSA: s_load_dword s0, s[4:5], 0x0
define amdgpu_kernel void @kernel_implicitarg_ptr_empty() #0 {
%implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
%cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
Expand Down Expand Up @@ -59,7 +58,7 @@ define amdgpu_kernel void @opencl_kernel_implicitarg_ptr_empty() #1 {
; GCN-LABEL: {{^}}kernel_implicitarg_ptr:
; GCN: enable_sgpr_kernarg_segment_ptr = 1

; HSA: kernarg_segment_byte_size = 112
; HSA: kernarg_segment_byte_size = 168
; HSA: kernarg_segment_alignment = 4

; MESA: kernarg_segment_byte_size = 128
Expand Down Expand Up @@ -115,17 +114,17 @@ define void @opencl_func_implicitarg_ptr() #0 {
}

; GCN-LABEL: {{^}}kernel_call_implicitarg_ptr_func_empty:
; HSA: enable_sgpr_kernarg_segment_ptr = 0
; HSA: kernarg_segment_byte_size = 0
; HSA: enable_sgpr_kernarg_segment_ptr = 1
; HSA: kernarg_segment_byte_size = 56
; HSA: kernarg_segment_alignment = 4

; MESA: enable_sgpr_kernarg_segment_ptr = 1
; MESA: kernarg_segment_byte_size = 16
; MESA: kernarg_segment_alignment = 4

; XGCN-NOT: s[4:5]
; XGCN-NOT: s4
; XGCN-NOT: s5
; GCN-NOT: s[4:5]
; GCN-NOT: s4
; GCN-NOT: s5
; GCN: s_swappc_b64
define amdgpu_kernel void @kernel_call_implicitarg_ptr_func_empty() #0 {
call void @func_implicitarg_ptr()
Expand Down Expand Up @@ -168,8 +167,9 @@ define amdgpu_kernel void @opencl_kernel_call_implicitarg_ptr_func_empty() #1 {

; GCN-LABEL: {{^}}kernel_call_implicitarg_ptr_func:
; GCN: enable_sgpr_kernarg_segment_ptr = 1
; HSA: kernarg_segment_byte_size = 112
; HSA: kernarg_segment_byte_size = 168
; HSA: kernarg_segment_alignment = 4

; MESA: kernarg_segment_byte_size = 128
; MESA: kernarg_segment_alignment = 4

Expand Down Expand Up @@ -272,8 +272,8 @@ define amdgpu_kernel void @kernel_implicitarg_no_struct_align_padding(<16 x i32>
; HSA-LABEL: Kernels:
; HSA-LABEL: - Name: kernel_implicitarg_ptr_empty
; HSA: CodeProps:
; HSA: KernargSegmentSize: 0
; HSA: KernargSegmentAlign: 4
; HSA: KernargSegmentSize: 56
; HSA: KernargSegmentAlign: 8

; HSA-LABEL: - Name: kernel_implicitarg_ptr_empty_0implicit
; HSA: KernargSegmentSize: 0
Expand All @@ -284,16 +284,16 @@ define amdgpu_kernel void @kernel_implicitarg_no_struct_align_padding(<16 x i32>
; HSA: KernargSegmentAlign: 8

; HSA-LABEL: - Name: kernel_implicitarg_ptr
; HSA: KernargSegmentSize: 112
; HSA: KernargSegmentAlign: 4
; HSA: KernargSegmentSize: 168
; HSA: KernargSegmentAlign: 8

; HSA-LABEL: - Name: opencl_kernel_implicitarg_ptr
; HSA: KernargSegmentSize: 160
; HSA: KernargSegmentAlign: 8

; HSA-LABEL: - Name: kernel_call_implicitarg_ptr_func_empty
; HSA: KernargSegmentSize: 0
; HSA: KernargSegmentAlign: 4
; HSA: KernargSegmentSize: 56
; HSA: KernargSegmentAlign: 8

; HSA-LABEL: - Name: kernel_call_implicitarg_ptr_func_empty_implicit0
; HSA: KernargSegmentSize: 0
Expand All @@ -304,16 +304,16 @@ define amdgpu_kernel void @kernel_implicitarg_no_struct_align_padding(<16 x i32>
; HSA: KernargSegmentAlign: 8

; HSA-LABEL: - Name: kernel_call_implicitarg_ptr_func
; HSA: KernargSegmentSize: 112
; HSA: KernargSegmentAlign: 4
; HSA: KernargSegmentSize: 168
; HSA: KernargSegmentAlign: 8

; HSA-LABEL: - Name: opencl_kernel_call_implicitarg_ptr_func
; HSA: KernargSegmentSize: 160
; HSA: KernargSegmentAlign: 8

; HSA-LABEL: - Name: kernel_call_kernarg_implicitarg_ptr_func
; HSA: KernargSegmentSize: 112
; HSA: KernargSegmentAlign: 4
; HSA: KernargSegmentSize: 168
; HSA: KernargSegmentAlign: 8

; HSA-LABEL: - Name: kernel_implicitarg_no_struct_align_padding
; HSA: KernargSegmentSize: 120
Expand Down
12 changes: 4 additions & 8 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ define amdgpu_kernel void @test(i32 addrspace(1)* %out) #1 {
}

; ALL-LABEL: {{^}}test_implicit:
; HSA: kernarg_segment_byte_size = 8
; HSA: kernarg_segment_byte_size = 64
; OS-MESA3D: kernarg_segment_byte_size = 24
; CO-V2: kernarg_segment_alignment = 4

Expand All @@ -36,7 +36,7 @@ define amdgpu_kernel void @test_implicit(i32 addrspace(1)* %out) #1 {
}

; ALL-LABEL: {{^}}test_implicit_alignment:
; HSA: kernarg_segment_byte_size = 12
; HSA: kernarg_segment_byte_size = 72
; OS-MESA3D: kernarg_segment_byte_size = 28
; CO-V2: kernarg_segment_alignment = 4

Expand Down Expand Up @@ -75,14 +75,10 @@ define amdgpu_kernel void @opencl_test_implicit_alignment(i32 addrspace(1)* %out
ret void
}

; Mesa implies that 16 bytes are always allocated; HSA requires the
; attribute for the additional space.
; ALL-LABEL: {{^}}test_no_kernargs:
; HSA: enable_sgpr_kernarg_segment_ptr = 0
; HSA: kernarg_segment_byte_size = 0
; CO-V2: enable_sgpr_kernarg_segment_ptr = 0
; CO-V2: kernarg_segment_byte_size = 0

; OS-MESA3D: enable_sgpr_kernarg_segment_ptr = 1
; OS-MESA3D: kernarg_segment_byte_size = 16
; CO-V2: kernarg_segment_alignment = 4

; HSA: s_mov_b64 [[NULL:s\[[0-9]+:[0-9]+\]]], 0{{$}}
Expand Down
243 changes: 122 additions & 121 deletions llvm/test/CodeGen/AMDGPU/lower-kernargs.ll

Large diffs are not rendered by default.