Skip to content

Commit

Permalink
[NVPTX] Improve lowering of byval args of device functions.
Browse files Browse the repository at this point in the history
Avoid unnecessary spills of byval arguments of device functions to
local space on SASS level and subsequent pointer conversion to generic
address space that follows. Instead, make a local copy in IR, provide
a way to access arguments directly, and let LLVM optimize the copy away
when possible.

Differential Review: https://reviews.llvm.org/D21421

llvm-svn: 276153
  • Loading branch information
Artem-B committed Jul 20, 2016
1 parent 3b24b80 commit b2e76a5
Show file tree
Hide file tree
Showing 5 changed files with 71 additions and 40 deletions.
11 changes: 6 additions & 5 deletions llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5076,11 +5076,12 @@ bool NVPTXDAGToDAGISel::SelectDirectAddr(SDValue N, SDValue &Address) {
Address = N.getOperand(0);
return true;
}
if (N.getOpcode() == ISD::INTRINSIC_WO_CHAIN) {
unsigned IID = cast<ConstantSDNode>(N.getOperand(0))->getZExtValue();
if (IID == Intrinsic::nvvm_ptr_gen_to_param)
if (N.getOperand(1).getOpcode() == NVPTXISD::MoveParam)
return (SelectDirectAddr(N.getOperand(1).getOperand(0), Address));
// addrspacecast(MoveParam(arg_symbol) to addrspace(PARAM)) -> arg_symbol
if (AddrSpaceCastSDNode *CastN = dyn_cast<AddrSpaceCastSDNode>(N)) {
if (CastN->getSrcAddressSpace() == ADDRESS_SPACE_GENERIC &&
CastN->getDestAddressSpace() == ADDRESS_SPACE_PARAM &&
CastN->getOperand(0).getOpcode() == NVPTXISD::MoveParam)
return SelectDirectAddr(CastN->getOperand(0).getOperand(0), Address);
}
return false;
}
Expand Down
13 changes: 3 additions & 10 deletions llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2077,7 +2077,6 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
SDValue Root = DAG.getRoot();
std::vector<SDValue> OutChains;

bool isKernel = llvm::isKernelFunction(*F);
bool isABI = (STI.getSmVersion() >= 20);
assert(isABI && "Non-ABI compilation is not supported");
if (!isABI)
Expand Down Expand Up @@ -2111,7 +2110,8 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
theArgs[i],
(theArgs[i]->getParent() ? theArgs[i]->getParent()->getParent()
: nullptr))) {
assert(isKernel && "Only kernels can have image/sampler params");
assert(llvm::isKernelFunction(*F) &&
"Only kernels can have image/sampler params");
InVals.push_back(DAG.getConstant(i + 1, dl, MVT::i32));
continue;
}
Expand Down Expand Up @@ -2336,14 +2336,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg);
if (p.getNode())
p.getNode()->setIROrder(idx + 1);
if (isKernel)
InVals.push_back(p);
else {
SDValue p2 = DAG.getNode(
ISD::INTRINSIC_WO_CHAIN, dl, ObjectVT,
DAG.getConstant(Intrinsic::nvvm_ptr_local_to_gen, dl, MVT::i32), p);
InVals.push_back(p2);
}
InVals.push_back(p);
}

// Clang will check explicit VarArg and issue error if any. However, Clang
Expand Down
53 changes: 36 additions & 17 deletions llvm/lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,20 +7,28 @@
//
//===----------------------------------------------------------------------===//
//
// Pointer arguments to kernel functions need to be lowered specially.
//
// 1. Copy byval struct args to local memory. This is a preparation for handling
// cases like
// Arguments to kernel and device functions are passed via param space,
// which imposes certain restrictions:
// http://docs.nvidia.com/cuda/parallel-thread-execution/#state-spaces
//
// kernel void foo(struct A arg, ...)
// {
// struct A *p = &arg;
// ...
// ... = p->field1 ... (there is no generic address for .param)
// p->field2 = ... (there is no write access to .param)
// }
// Kernel parameters are read-only and accessible only via ld.param
// instruction, directly or via a pointer. Pointers to kernel
// arguments can't be converted to generic address space.
//
// Device function parameters are directly accessible via
// ld.param/st.param, but taking the address of one returns a pointer
// to a copy created in local space which *can't* be used with
// ld.param/st.param.
//
// Copying a byval struct into local memory in IR allows us to enforce
// the param space restrictions, gives the rest of IR a pointer w/o
// param space restrictions, and gives us an opportunity to eliminate
// the copy.
//
// 2. Convert non-byval pointer arguments of CUDA kernels to pointers in the
// Pointer arguments to kernel functions need more work to be lowered:
//
// 1. Convert non-byval pointer arguments of CUDA kernels to pointers in the
// global address space. This allows later optimizations to emit
// ld.global.*/st.global.* for accessing these pointer arguments. For
// example,
Expand All @@ -47,7 +55,7 @@
// ...
// }
//
// 3. Convert pointers in a byval kernel parameter to pointers in the global
// 2. Convert pointers in a byval kernel parameter to pointers in the global
// address space. As #1, it allows NVPTX to emit more ld/st.global. E.g.,
//
// struct S {
Expand Down Expand Up @@ -101,6 +109,9 @@ namespace {
class NVPTXLowerKernelArgs : public FunctionPass {
bool runOnFunction(Function &F) override;

bool runOnKernelFunction(Function &F);
bool runOnDeviceFunction(Function &F);

// handle byval parameters
void handleByValParam(Argument *Arg);
// Knowing Ptr must point to the global address space, this function
Expand Down Expand Up @@ -192,11 +203,7 @@ void NVPTXLowerKernelArgs::markPointerAsGlobal(Value *Ptr) {
// =============================================================================
// Main function for this pass.
// =============================================================================
bool NVPTXLowerKernelArgs::runOnFunction(Function &F) {
// Skip non-kernels. See the comments at the top of this file.
if (!isKernelFunction(F))
return false;

bool NVPTXLowerKernelArgs::runOnKernelFunction(Function &F) {
if (TM && TM->getDrvInterface() == NVPTX::CUDA) {
// Mark pointers in byval structs as global.
for (auto &B : F) {
Expand Down Expand Up @@ -228,6 +235,18 @@ bool NVPTXLowerKernelArgs::runOnFunction(Function &F) {
return true;
}

// Device functions only need to copy byval args into local memory.
//
// Walks every formal argument of F and passes each pointer-typed byval
// argument to handleByValParam(). Returns true only when at least one
// argument was passed on, so a function without byval arguments is reported
// as unmodified rather than unconditionally claiming a change.
bool NVPTXLowerKernelArgs::runOnDeviceFunction(Function &F) {
  bool Changed = false;
  for (Argument &Arg : F.args()) {
    if (!Arg.getType()->isPointerTy() || !Arg.hasByValAttr())
      continue;
    // NOTE(review): handleByValParam() returns void, so every call is
    // conservatively treated as having modified the function.
    handleByValParam(&Arg);
    Changed = true;
  }
  return Changed;
}

// Pass entry point: dispatches to runOnKernelFunction() when F is a kernel
// and to runOnDeviceFunction() for every other function, forwarding the
// callee's result unchanged.
bool NVPTXLowerKernelArgs::runOnFunction(Function &F) {
  if (isKernelFunction(F))
    return runOnKernelFunction(F);
  return runOnDeviceFunction(F);
}

FunctionPass *
llvm::createNVPTXLowerKernelArgsPass(const NVPTXTargetMachine *TM) {
return new NVPTXLowerKernelArgs(TM);
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/NVPTX/bug21465.ll
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ entry:
%b = getelementptr inbounds %struct.S, %struct.S* %input, i64 0, i32 1
%0 = load i32, i32* %b, align 4
; PTX-NOT: ld.param.u32 {{%r[0-9]+}}, [{{%rd[0-9]+}}]
; PTX: ld.param.u32 [[value:%r[0-9]+]], [{{%rd[0-9]+}}+4]
; PTX: ld.param.u32 [[value:%r[0-9]+]], [_Z11TakesStruct1SPi_param_0+4]
store i32 %0, i32* %output, align 4
; PTX-NEXT: st.global.u32 [{{%rd[0-9]+}}], [[value]]
ret void
Expand Down
32 changes: 25 additions & 7 deletions llvm/test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll
Original file line number Diff line number Diff line change
Expand Up @@ -28,20 +28,38 @@ define void @kernel2(float addrspace(1)* %input, float addrspace(1)* %output) {

%struct.S = type { i32*, i32* }

define void @ptr_in_byval(%struct.S* byval %input, i32* %output) {
; CHECK-LABEL: .visible .entry ptr_in_byval(
; CHECK: cvta.to.global.u64
; CHECK: cvta.to.global.u64
; Kernel variant (this function is listed in !nvvm.annotations as a kernel).
; The FileCheck patterns below expect the byval struct field to be read
; directly from the parameter symbol at offset +8, and both that loaded
; pointer and the %output pointer to be converted with cvta.to.global so the
; following load and store use the .global state space.
define void @ptr_in_byval_kernel(%struct.S* byval %input, i32* %output) {
; CHECK-LABEL: .visible .entry ptr_in_byval_kernel(
; CHECK: ld.param.u64 %[[optr:rd.*]], [ptr_in_byval_kernel_param_1]
; CHECK: cvta.to.global.u64 %[[optr_g:.*]], %[[optr]];
; CHECK: ld.param.u64 %[[iptr:rd.*]], [ptr_in_byval_kernel_param_0+8]
; CHECK: cvta.to.global.u64 %[[iptr_g:.*]], %[[iptr]];
%b_ptr = getelementptr inbounds %struct.S, %struct.S* %input, i64 0, i32 1
%b = load i32*, i32** %b_ptr, align 4
%v = load i32, i32* %b, align 4
; CHECK: ld.global.u32 %[[val:.*]], [%[[iptr_g]]]
store i32 %v, i32* %output, align 4
; CHECK: st.global.u32 [%[[optr_g]]], %[[val]]
ret void
}

; Regular functions lower byval arguments differently. We need to make
; sure that we're loading byval argument data using [symbol+offset].
; There's also no assumption that all pointers within are in global space.
define void @ptr_in_byval_func(%struct.S* byval %input, i32* %output) {
; CHECK-LABEL: .visible .func ptr_in_byval_func(
; CHECK: ld.param.u64 %[[optr:rd.*]], [ptr_in_byval_func_param_1]
; CHECK: ld.param.u64 %[[iptr:rd.*]], [ptr_in_byval_func_param_0+8]
%b_ptr = getelementptr inbounds %struct.S, %struct.S* %input, i64 0, i32 1
%b = load i32*, i32** %b_ptr, align 4
%v = load i32, i32* %b, align 4
; CHECK: ld.global.u32
; CHECK: ld.u32 %[[val:.*]], [%[[iptr]]]
store i32 %v, i32* %output, align 4
; CHECK: st.global.u32
; CHECK: st.u32 [%[[optr]]], %[[val]]
ret void
}

!nvvm.annotations = !{!0, !1, !2}
!0 = !{void (float*, float*)* @kernel, !"kernel", i32 1}
!1 = !{void (float addrspace(1)*, float addrspace(1)*)* @kernel2, !"kernel", i32 1}
!2 = !{void (%struct.S*, i32*)* @ptr_in_byval, !"kernel", i32 1}
!2 = !{void (%struct.S*, i32*)* @ptr_in_byval_kernel, !"kernel", i32 1}

0 comments on commit b2e76a5

Please sign in to comment.