Skip to content

Commit

Permalink
AMDGPU: Don't handle kernarg.segment.ptr in functions
Browse files Browse the repository at this point in the history
In non-kernel functions, just lower llvm.amdgcn.kernarg.segment.ptr to
null. Pass implicitarg.ptr in its place in the argument list.
  • Loading branch information
arsenm committed Mar 13, 2020
1 parent f82b32a commit bb86220
Show file tree
Hide file tree
Showing 7 changed files with 66 additions and 56 deletions.
17 changes: 11 additions & 6 deletions llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
Expand Up @@ -216,7 +216,7 @@ static void copyFeaturesToFunction(Function &Parent, const Function &Callee,
"amdgpu-work-item-id-z", "amdgpu-work-group-id-x",
"amdgpu-work-group-id-y", "amdgpu-work-group-id-z",
"amdgpu-dispatch-ptr", "amdgpu-dispatch-id",
"amdgpu-kernarg-segment-ptr", "amdgpu-implicitarg-ptr"};
"amdgpu-implicitarg-ptr"};

if (handleAttr(Parent, Callee, "amdgpu-queue-ptr"))
NeedQueuePtr = true;
Expand Down Expand Up @@ -305,11 +305,16 @@ bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
Changed = true;
} else {
bool NonKernelOnly = false;
StringRef AttrName = intrinsicToAttrName(IID,
NonKernelOnly, NeedQueuePtr);
if (!AttrName.empty() && (IsFunc || !NonKernelOnly)) {
F.addFnAttr(AttrName);
Changed = true;

if (!IsFunc && IID == Intrinsic::amdgcn_kernarg_segment_ptr) {
F.addFnAttr("amdgpu-kernarg-segment-ptr");
} else {
StringRef AttrName = intrinsicToAttrName(IID, NonKernelOnly,
NeedQueuePtr);
if (!AttrName.empty() && (IsFunc || !NonKernelOnly)) {
F.addFnAttr(AttrName);
Changed = true;
}
}
}
}
Expand Down
8 changes: 8 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
Expand Up @@ -3747,6 +3747,14 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
return false;
}
case Intrinsic::amdgcn_kernarg_segment_ptr:
if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
B.setInstr(MI);
// This only makes sense to call in a kernel, so just lower to null.
B.buildConstant(MI.getOperand(0).getReg(), 0);
MI.eraseFromParent();
return true;
}

return legalizePreloadedArgIntrin(
MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
case Intrinsic::amdgcn_implicitarg_ptr:
Expand Down
19 changes: 11 additions & 8 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Expand Up @@ -1724,8 +1724,10 @@ void SITargetLowering::allocateSpecialInputSGPRs(
if (Info.hasQueuePtr())
ArgInfo.QueuePtr = allocateSGPR64Input(CCInfo);

if (Info.hasKernargSegmentPtr())
ArgInfo.KernargSegmentPtr = allocateSGPR64Input(CCInfo);
// Implicit arg ptr takes the place of the kernarg segment pointer. This is a
// constant offset from the kernarg segment.
if (Info.hasImplicitArgPtr())
ArgInfo.ImplicitArgPtr = allocateSGPR64Input(CCInfo);

if (Info.hasDispatchID())
ArgInfo.DispatchID = allocateSGPR64Input(CCInfo);
Expand All @@ -1740,9 +1742,6 @@ void SITargetLowering::allocateSpecialInputSGPRs(

if (Info.hasWorkGroupIDZ())
ArgInfo.WorkGroupIDZ = allocateSGPR32Input(CCInfo);

if (Info.hasImplicitArgPtr())
ArgInfo.ImplicitArgPtr = allocateSGPR64Input(CCInfo);
}

// Allocate special inputs passed in user SGPRs.
Expand Down Expand Up @@ -2448,12 +2447,11 @@ void SITargetLowering::passSpecialInputs(
AMDGPUFunctionArgInfo::PreloadedValue InputRegs[] = {
AMDGPUFunctionArgInfo::DISPATCH_PTR,
AMDGPUFunctionArgInfo::QUEUE_PTR,
AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR,
AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR,
AMDGPUFunctionArgInfo::DISPATCH_ID,
AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,
AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR
AMDGPUFunctionArgInfo::WORKGROUP_ID_Z
};

for (auto InputID : InputRegs) {
Expand Down Expand Up @@ -5735,6 +5733,11 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
}
case Intrinsic::amdgcn_kernarg_segment_ptr: {
if (!AMDGPU::isKernel(MF.getFunction().getCallingConv())) {
// This only makes sense to call in a kernel, so just lower to null.
return DAG.getConstant(0, DL, VT);
}

return getPreloadedValue(DAG, *MFI, VT,
AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
}
Expand Down
Expand Up @@ -114,6 +114,16 @@ define amdgpu_kernel void @opencl_test_implicit_alignment_no_explicit_kernargs_r
ret void
}

; ALL-LABEL: {{^}}func_kernarg_segment_ptr:
; ALL: s_mov_b32 [[S_LO:s[0-9]+]], 0{{$}}
; ALL: s_mov_b32 [[S_HI:s[0-9]+]], 0{{$}}
; ALL: v_mov_b32_e32 v0, [[S_LO]]{{$}}
; ALL: v_mov_b32_e32 v1, [[S_HI]]{{$}}
; Non-kernel function (no amdgpu_kernel calling convention) that calls
; llvm.amdgcn.kernarg.segment.ptr and returns the resulting pointer unchanged.
define i8 addrspace(4)* @func_kernarg_segment_ptr() {
%ptr = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
ret i8 addrspace(4)* %ptr
}

declare i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #0
declare i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #0

Expand Down
Expand Up @@ -231,7 +231,7 @@ define void @use_kernarg_segment_ptr() #1 {
ret void
}

; HSA: define void @func_indirect_use_kernarg_segment_ptr() #14 {
; HSA: define void @func_indirect_use_kernarg_segment_ptr() #11 {
define void @func_indirect_use_kernarg_segment_ptr() #1 {
call void @use_kernarg_segment_ptr()
ret void
Expand Down
29 changes: 13 additions & 16 deletions llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll
Expand Up @@ -66,10 +66,10 @@ define amdgpu_kernel void @kern_indirect_use_queue_ptr_addrspacecast(i32) #1 {
ret void
}

; Not really supported in callable functions.
; GCN-LABEL: {{^}}use_kernarg_segment_ptr:
; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s4
; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s5
; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
; GCN: s_mov_b64 [[PTR:s\[[0-9]+:[0-9]+\]]], 0{{$}}
; GCN: s_load_dword s{{[0-9]+}}, [[PTR]], 0x0{{$}}
define hidden void @use_kernarg_segment_ptr() #1 {
%kernarg_segment_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #0
%header_ptr = bitcast i8 addrspace(4)* %kernarg_segment_ptr to i32 addrspace(4)*
Expand All @@ -79,10 +79,6 @@ define hidden void @use_kernarg_segment_ptr() #1 {

; GCN-LABEL: {{^}}kern_indirect_use_kernarg_segment_ptr:
; GCN: enable_sgpr_kernarg_segment_ptr = 1
; GCN-NOT: s[4:5]
; GCN-NOT: s4
; GCN-NOT: s5
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_indirect_use_kernarg_segment_ptr(i32) #1 {
call void @use_kernarg_segment_ptr()
ret void
Expand Down Expand Up @@ -437,9 +433,9 @@ define hidden void @use_every_sgpr_input() #1 {
%queue_ptr.bc = bitcast i8 addrspace(4)* %queue_ptr to i32 addrspace(4)*
%val1 = load volatile i32, i32 addrspace(4)* %queue_ptr.bc

%kernarg_segment_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #0
%kernarg_segment_ptr.bc = bitcast i8 addrspace(4)* %kernarg_segment_ptr to i32 addrspace(4)*
%val2 = load volatile i32, i32 addrspace(4)* %kernarg_segment_ptr.bc
%implicitarg.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #0
%implicitarg.ptr.bc = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
%val2 = load volatile i32, i32 addrspace(4)* %implicitarg.ptr.bc

%val3 = call i64 @llvm.amdgcn.dispatch.id()
call void asm sideeffect "; use $0", "s"(i64 %val3)
Expand Down Expand Up @@ -521,9 +517,9 @@ define hidden void @func_use_every_sgpr_input_call_use_workgroup_id_xyz() #1 {
%queue_ptr.bc = bitcast i8 addrspace(4)* %queue_ptr to i32 addrspace(4)*
%val1 = load volatile i32, i32 addrspace(4)* %queue_ptr.bc

%kernarg_segment_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #0
%kernarg_segment_ptr.bc = bitcast i8 addrspace(4)* %kernarg_segment_ptr to i32 addrspace(4)*
%val2 = load volatile i32, i32 addrspace(4)* %kernarg_segment_ptr.bc
%implicitarg.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #0
%implicitarg.ptr.bc = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
%val2 = load volatile i32, i32 addrspace(4)* %implicitarg.ptr.bc

%val3 = call i64 @llvm.amdgcn.dispatch.id()
call void asm sideeffect "; use $0", "s"(i64 %val3)
Expand Down Expand Up @@ -590,9 +586,9 @@ define hidden void @func_use_every_sgpr_input_call_use_workgroup_id_xyz_spill()
%queue_ptr.bc = bitcast i8 addrspace(4)* %queue_ptr to i32 addrspace(4)*
%val1 = load volatile i32, i32 addrspace(4)* %queue_ptr.bc

%kernarg_segment_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #0
%kernarg_segment_ptr.bc = bitcast i8 addrspace(4)* %kernarg_segment_ptr to i32 addrspace(4)*
%val2 = load volatile i32, i32 addrspace(4)* %kernarg_segment_ptr.bc
%implicitarg.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #0
%implicitarg.ptr.bc = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
%val2 = load volatile i32, i32 addrspace(4)* %implicitarg.ptr.bc

%val3 = call i64 @llvm.amdgcn.dispatch.id()
call void asm sideeffect "; use $0", "s"(i64 %val3)
Expand All @@ -614,6 +610,7 @@ declare i32 @llvm.amdgcn.workgroup.id.y() #0
declare i32 @llvm.amdgcn.workgroup.id.z() #0
declare noalias i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #0
declare noalias i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #0
declare noalias i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #0
declare i64 @llvm.amdgcn.dispatch.id() #0
declare noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0

Expand Down
37 changes: 12 additions & 25 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll
Expand Up @@ -165,18 +165,13 @@ define void @opencl_func_call_implicitarg_ptr_func() #0 {

; GCN-LABEL: {{^}}func_kernarg_implicitarg_ptr:
; GCN: s_waitcnt
; MESA-DAG: v_mov_b32_e32 v0, s4
; MESA-DAG: v_mov_b32_e32 v1, s5
; MESA-DAG: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; MESA: v_mov_b32_e32 v0, s6
; MESA: v_mov_b32_e32 v1, s7
; GCN-DAG: v_mov_b32_e32 v0, s4
; GCN-DAG: v_mov_b32_e32 v1, s5
; GCN-DAG: s_mov_b64 [[NULL:s\[[0-9]+:[0-9]+\]]], 0
; GCN-DAG: s_load_dword s{{[0-9]+}}, [[NULL]], 0x0

; MESA: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64

; HSA: v_mov_b32_e32 v0, s4
; HSA: v_mov_b32_e32 v1, s5
; HSA: flat_load_dword v0, v[0:1]
; HSA: v_mov_b32_e32 v0, s6
; HSA: v_mov_b32_e32 v1, s7
; HSA: flat_load_dword v0, v[0:1]

; GCN: s_waitcnt vmcnt(0)
Expand All @@ -192,20 +187,12 @@ define void @func_kernarg_implicitarg_ptr() #0 {

; GCN-LABEL: {{^}}opencl_func_kernarg_implicitarg_ptr:
; GCN: s_waitcnt
; MESA-DAG: v_mov_b32_e32 v0, s4
; MESA-DAG: v_mov_b32_e32 v1, s5
; MESA: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; MESA-DAG: v_mov_b32_e32 v0, s6
; MESA-DAG: v_mov_b32_e32 v1, s7
; MESA: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64

; GCN-DAG: v_mov_b32_e32 v0, s4
; GCN-DAG: v_mov_b32_e32 v1, s5
; GCN-DAG: s_mov_b64 [[NULL:s\[[0-9]+:[0-9]+\]]], 0
; GCN-DAG: s_load_dword s{{[0-9]+}}, [[NULL]], 0x0

; HSA: v_mov_b32_e32 v0, s4
; HSA: v_mov_b32_e32 v1, s5
; HSA: flat_load_dword v0, v[0:1]

; HSA: v_mov_b32_e32 v0, s6
; HSA: v_mov_b32_e32 v1, s7
; MESA: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; HSA: flat_load_dword v0, v[0:1]

; GCN: s_waitcnt vmcnt(0)
Expand All @@ -220,8 +207,8 @@ define void @opencl_func_kernarg_implicitarg_ptr() #0 {
}

; GCN-LABEL: {{^}}kernel_call_kernarg_implicitarg_ptr_func:
; GCN: s_add_u32 s6, s4, 0x70
; GCN: s_addc_u32 s7, s5, 0
; GCN: s_add_u32 s4, s4, 0x70
; GCN: s_addc_u32 s5, s5, 0
; GCN: s_swappc_b64
define amdgpu_kernel void @kernel_call_kernarg_implicitarg_ptr_func([112 x i8]) #0 {
call void @func_kernarg_implicitarg_ptr()
Expand Down

0 comments on commit bb86220

Please sign in to comment.