Skip to content

Commit

Permalink
[NVPTX] Add lowering of i128 params.
Browse files Browse the repository at this point in the history
This patch adds support for lowering i128 params. The changes are small: i128 is
handled as a "special case" of the integer type. With this patch, we lower i128
params the same way as aggregates of size 16 bytes: .param .b8 _ [16].

Currently, NVPTX cannot handle 128-bit integers:
* in some cases it fails assertions such as
  ValVTs.size() == OutVals.size() && "Bad return value decomposition"
* in other cases it emits PTX with .i128 or .u128 types (which are not valid [1])
  [1] http://docs.nvidia.com/cuda/parallel-thread-execution/index.html#fundamental-types

Differential Revision: https://reviews.llvm.org/D34555
Patch by: Denys Zariaiev (denys.zariaiev@gmail.com)

llvm-svn: 308675
  • Loading branch information
Artem-B committed Jul 20, 2017
1 parent e5456ce commit d7a7382
Show file tree
Hide file tree
Showing 8 changed files with 129 additions and 15 deletions.
4 changes: 2 additions & 2 deletions clang/lib/Basic/Targets.cpp
Expand Up @@ -1833,9 +1833,9 @@ class NVPTXTargetInfo : public TargetInfo {
GPU = CudaArch::SM_20;

if (TargetPointerWidth == 32)
resetDataLayout("e-p:32:32-i64:64-v16:16-v32:32-n16:32:64");
resetDataLayout("e-p:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64");
else
resetDataLayout("e-i64:64-v16:16-v32:32-n16:32:64");
resetDataLayout("e-i64:64-i128:128-v16:16-v32:32-n16:32:64");

// If possible, get a TargetInfo for our host triple, so we can match its
// types.
Expand Down
4 changes: 2 additions & 2 deletions clang/test/CodeGen/target-data.c
Expand Up @@ -116,11 +116,11 @@

// RUN: %clang_cc1 -triple nvptx-unknown -o - -emit-llvm %s | \
// RUN: FileCheck %s -check-prefix=NVPTX
// NVPTX: target datalayout = "e-p:32:32-i64:64-v16:16-v32:32-n16:32:64"
// NVPTX: target datalayout = "e-p:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"

// RUN: %clang_cc1 -triple nvptx64-unknown -o - -emit-llvm %s | \
// RUN: FileCheck %s -check-prefix=NVPTX64
// NVPTX64: target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
// NVPTX64: target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"

// RUN: %clang_cc1 -triple r600-unknown -o - -emit-llvm %s | \
// RUN: FileCheck %s -check-prefix=R600
Expand Down
14 changes: 11 additions & 3 deletions llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
Expand Up @@ -400,7 +400,7 @@ void NVPTXAsmPrinter::printReturnValStr(const Function *F, raw_ostream &O) {
O << " (";

if (isABI) {
if (Ty->isFloatingPointTy() || Ty->isIntegerTy()) {
if (Ty->isFloatingPointTy() || (Ty->isIntegerTy() && !Ty->isIntegerTy(128))) {
unsigned size = 0;
if (auto *ITy = dyn_cast<IntegerType>(Ty)) {
size = ITy->getBitWidth();
Expand All @@ -418,7 +418,7 @@ void NVPTXAsmPrinter::printReturnValStr(const Function *F, raw_ostream &O) {
} else if (isa<PointerType>(Ty)) {
O << ".param .b" << TLI->getPointerTy(DL).getSizeInBits()
<< " func_retval0";
} else if (Ty->isAggregateType() || Ty->isVectorTy()) {
} else if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) {
unsigned totalsz = DL.getTypeAllocSize(Ty);
unsigned retAlignment = 0;
if (!getAlign(*F, 0, retAlignment))
Expand Down Expand Up @@ -1425,6 +1425,14 @@ void NVPTXAsmPrinter::emitPTXGlobalVariable(const GlobalVariable *GVar,
else
O << " .align " << GVar->getAlignment();

// Special case for i128
if (ETy->isIntegerTy(128)) {
O << " .b8 ";
getSymbol(GVar)->print(O, MAI);
O << "[16]";
return;
}

if (ETy->isFloatingPointTy() || ETy->isIntegerTy() || ETy->isPointerTy()) {
O << " .";
O << getPTXFundamentalTypeStr(ETy);
Expand Down Expand Up @@ -1551,7 +1559,7 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
}

if (!PAL.hasParamAttribute(paramIndex, Attribute::ByVal)) {
if (Ty->isAggregateType() || Ty->isVectorTy()) {
if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) {
// Just print .param .align <a> .b8 .param[size];
// <a> = PAL.getparamalignment
// size = typeallocsize of element type
Expand Down
27 changes: 20 additions & 7 deletions llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
Expand Up @@ -168,6 +168,19 @@ static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
SmallVector<EVT, 16> TempVTs;
SmallVector<uint64_t, 16> TempOffsets;

// Special case for i128 - decompose to (i64, i64)
if (Ty->isIntegerTy(128)) {
ValueVTs.push_back(EVT(MVT::i64));
ValueVTs.push_back(EVT(MVT::i64));

if (Offsets) {
Offsets->push_back(StartingOffset + 0);
Offsets->push_back(StartingOffset + 8);
}

return;
}

ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset);
for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) {
EVT VT = TempVTs[i];
Expand Down Expand Up @@ -1262,7 +1275,7 @@ std::string NVPTXTargetLowering::getPrototype(
O << "()";
} else {
O << "(";
if (retTy->isFloatingPointTy() || retTy->isIntegerTy()) {
if (retTy->isFloatingPointTy() || (retTy->isIntegerTy() && !retTy->isIntegerTy(128))) {
unsigned size = 0;
if (auto *ITy = dyn_cast<IntegerType>(retTy)) {
size = ITy->getBitWidth();
Expand All @@ -1280,7 +1293,7 @@ std::string NVPTXTargetLowering::getPrototype(
O << ".param .b" << size << " _";
} else if (isa<PointerType>(retTy)) {
O << ".param .b" << PtrVT.getSizeInBits() << " _";
} else if (retTy->isAggregateType() || retTy->isVectorTy()) {
} else if (retTy->isAggregateType() || retTy->isVectorTy() || retTy->isIntegerTy(128)) {
auto &DL = CS->getCalledFunction()->getParent()->getDataLayout();
O << ".param .align " << retAlignment << " .b8 _["
<< DL.getTypeAllocSize(retTy) << "]";
Expand All @@ -1302,7 +1315,7 @@ std::string NVPTXTargetLowering::getPrototype(
first = false;

if (!Outs[OIdx].Flags.isByVal()) {
if (Ty->isAggregateType() || Ty->isVectorTy()) {
if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) {
unsigned align = 0;
const CallInst *CallI = cast<CallInst>(CS->getInstruction());
// +1 because index 0 is reserved for return type alignment
Expand Down Expand Up @@ -1458,7 +1471,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
unsigned AllocSize = DL.getTypeAllocSize(Ty);
SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
bool NeedAlign; // Does argument declaration specify alignment?
if (Ty->isAggregateType() || Ty->isVectorTy()) {
if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) {
// declare .param .align <align> .b8 .param<n>[<size>];
SDValue DeclareParamOps[] = {
Chain, DAG.getConstant(ArgAlign, dl, MVT::i32),
Expand Down Expand Up @@ -1634,8 +1647,8 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// these three types to match the logic in
// NVPTXAsmPrinter::printReturnValStr and NVPTXTargetLowering::getPrototype.
// Plus, this behavior is consistent with nvcc's.
if (RetTy->isFloatingPointTy() || RetTy->isIntegerTy() ||
RetTy->isPointerTy()) {
if (RetTy->isFloatingPointTy() || RetTy->isPointerTy() ||
(RetTy->isIntegerTy() && !RetTy->isIntegerTy(128))) {
// Scalar needs to be at least 32bit wide
if (resultsz < 32)
resultsz = 32;
Expand Down Expand Up @@ -2366,7 +2379,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(

if (theArgs[i]->use_empty()) {
// argument is dead
if (Ty->isAggregateType()) {
if (Ty->isAggregateType() || Ty->isIntegerTy(128)) {
SmallVector<EVT, 16> vtparts;

ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts);
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
Expand Up @@ -81,7 +81,7 @@ static std::string computeDataLayout(bool is64Bit) {
if (!is64Bit)
Ret += "-p:32:32";

Ret += "-i64:64-v16:16-v32:32-n16:32:64";
Ret += "-i64:64-i128:128-v16:16-v32:32-n16:32:64";

return Ret;
}
Expand Down
7 changes: 7 additions & 0 deletions llvm/test/CodeGen/NVPTX/i128-global.ll
@@ -0,0 +1,7 @@
; RUN: llc < %s -O0 -march=nvptx64 -mcpu=sm_20 | FileCheck %s

; CHECK: .visible .global .align 16 .b8 G1[16] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
@G1 = global i128 1

; CHECK: .visible .global .align 16 .b8 G2[16];
@G2 = global i128 0
58 changes: 58 additions & 0 deletions llvm/test/CodeGen/NVPTX/i128-param.ll
@@ -0,0 +1,58 @@
; RUN: llc < %s -O0 -march=nvptx -mcpu=sm_20 | FileCheck %s

; Test function: takes two i128 values and an i128 pointer, multiplies the two
; values and stores the product through the pointer.
; Expected PTX signature: each i128 parameter is declared as a 16-byte-aligned
; .b8 array of 16 bytes instead of a scalar integer parameter.
; CHECK-LABEL: .visible .func callee(
; CHECK-NEXT: .param .align 16 .b8 callee_param_0[16],
; CHECK-NEXT: .param .align 16 .b8 callee_param_1[16],
define void @callee(i128, i128, i128*) {
; Each i128 argument is expected to be read with a single v2.u64 parameter load
; that yields a pair of 64-bit registers (captured as REG0/REG1 and REG2/REG3).
; CHECK-DAG: ld.param.v2.u64 {%[[REG0:rd[0-9]+]], %[[REG1:rd[0-9]+]]}, [callee_param_0];
; CHECK-DAG: ld.param.v2.u64 {%[[REG2:rd[0-9]+]], %[[REG3:rd[0-9]+]]}, [callee_param_1];

; The 128-bit multiply is expected to appear as a consecutive sequence of 64-bit
; mul.lo / mul.hi / add instructions operating on the loaded register pairs.
; CHECK: mul.lo.s64 %[[REG4:rd[0-9]+]], %[[REG0]], %[[REG3]];
; CHECK-NEXT: mul.hi.u64 %[[REG5:rd[0-9]+]], %[[REG0]], %[[REG2]];
; CHECK-NEXT: add.s64 %[[REG6:rd[0-9]+]], %[[REG5]], %[[REG4]];
; CHECK-NEXT: mul.lo.s64 %[[REG7:rd[0-9]+]], %[[REG1]], %[[REG2]];
; CHECK-NEXT: add.s64 %[[REG8:rd[0-9]+]], %[[REG6]], %[[REG7]];
; CHECK-NEXT: mul.lo.s64 %[[REG9:rd[0-9]+]], %[[REG0]], %[[REG2]];
%a = mul i128 %0, %1

store i128 %a, i128* %2
ret void
}

; Test kernel (ptx_kernel calling convention): forwards its two i128 arguments
; and the i128 pointer unchanged to @callee.
; Expected PTX signature: emitted as an .entry whose i128 parameters are
; declared as 16-byte-aligned .b8 arrays of 16 bytes.
; CHECK-LABEL: .visible .entry caller_kernel(
; CHECK-NEXT: .param .align 16 .b8 caller_kernel_param_0[16],
; CHECK-NEXT: .param .align 16 .b8 caller_kernel_param_1[16],
define ptx_kernel void @caller_kernel(i128, i128, i128*) {
start:
; Incoming i128 arguments are expected to be loaded as pairs of 64-bit registers.
; CHECK-DAG: ld.param.v2.u64 {%[[REG0:rd[0-9]+]], %[[REG1:rd[0-9]+]]}, [caller_kernel_param_0];
; CHECK-DAG: ld.param.v2.u64 {%[[REG2:rd[0-9]+]], %[[REG3:rd[0-9]+]]}, [caller_kernel_param_1];

; Inside the call sequence, each outgoing i128 argument is expected to be
; declared as a 16-byte .b8 param array and, on the very next output line,
; filled with one v2.b64 store of the register pair loaded above.
; CALLSEQ_ID captures the sequence number so that the closing brace is matched
; against the same call sequence as the opening one.
; CHECK: { // callseq [[CALLSEQ_ID:[0-9]]], 0
; CHECK: .param .align 16 .b8 param0[16];
; CHECK-NEXT: st.param.v2.b64 [param0+0], {%[[REG0]], %[[REG1]]}
; CHECK: .param .align 16 .b8 param1[16];
; CHECK-NEXT: st.param.v2.b64 [param1+0], {%[[REG2]], %[[REG3]]}
; CHECK: } // callseq [[CALLSEQ_ID]]
call void @callee(i128 %0, i128 %1, i128* %2)

ret void
}

; Test function (default calling convention): forwards its two i128 arguments
; and the i128 pointer unchanged to @callee.
; Expected PTX signature: emitted as a .func whose i128 parameters are declared
; as 16-byte-aligned .b8 arrays of 16 bytes.
; CHECK-LABEL: .visible .func caller_func(
; CHECK-NEXT: .param .align 16 .b8 caller_func_param_0[16],
; CHECK-NEXT: .param .align 16 .b8 caller_func_param_1[16],
define void @caller_func(i128, i128, i128*) {
start:
; Incoming i128 arguments are expected to be loaded as pairs of 64-bit registers.
; CHECK-DAG: ld.param.v2.u64 {%[[REG0:rd[0-9]+]], %[[REG1:rd[0-9]+]]}, [caller_func_param_0]
; CHECK-DAG: ld.param.v2.u64 {%[[REG2:rd[0-9]+]], %[[REG3:rd[0-9]+]]}, [caller_func_param_1]

; Inside the call sequence, each outgoing i128 argument is expected to be
; declared as a 16-byte .b8 param array and later filled with one v2.b64 store
; of the register pair loaded above. The store directives here are not "-NEXT"
; forms, so other output lines may sit between a declaration and its store.
; CALLSEQ_ID ties the closing brace to the same call sequence as the opening one.
; CHECK: { // callseq [[CALLSEQ_ID:[0-9]]], 0
; CHECK: .param .align 16 .b8 param0[16];
; CHECK: st.param.v2.b64 [param0+0], {%[[REG0]], %[[REG1]]}
; CHECK: .param .align 16 .b8 param1[16];
; CHECK: st.param.v2.b64 [param1+0], {%[[REG2]], %[[REG3]]}
; CHECK: } // callseq [[CALLSEQ_ID]]
call void @callee(i128 %0, i128 %1, i128* %2)

ret void
}
28 changes: 28 additions & 0 deletions llvm/test/CodeGen/NVPTX/i128-retval.ll
@@ -0,0 +1,28 @@
; RUN: llc < %s -O0 -march=nvptx64 -mcpu=sm_20 | FileCheck %s

; Test function: returns its single i128 argument unchanged.
; Expected PTX signature: the i128 return value is declared as a
; 16-byte-aligned .b8 array of 16 bytes named func_retval0.
; CHECK-LABEL: .visible .func (.param .align 16 .b8 func_retval0[16]) callee(
define i128 @callee(i128) {
; The argument is expected to be loaded as a pair of 64-bit registers and that
; same pair written to the return slot with a single v2.b64 store.
; CHECK: ld.param.v2.u64 {%[[REG0:rd[0-9]+]], %[[REG1:rd[0-9]+]]}, [callee_param_0];
; CHECK: st.param.v2.b64 [func_retval0+0], {%[[REG0]], %[[REG1]]}
ret i128 %0
}

; Test function: calls @callee with its i128 argument and stores the returned
; i128 through the pointer argument.
; CHECK-LABEL: .visible .func caller(
define void @caller(i128, i128*) {
start:
; The i128 argument is expected to be loaded as a pair of 64-bit registers and
; the pointer argument as one 64-bit register (captured as OUT).
; CHECK-DAG: ld.param.v2.u64 {%[[REG0:rd[0-9]+]], %[[REG1:rd[0-9]+]]}, [caller_param_0];
; CHECK-DAG: ld.param.u64 %[[OUT:rd[0-9]+]], [caller_param_1];

; Inside the call sequence, the return slot is expected to be declared as a
; 16-byte-aligned .b8 array of 16 bytes, passed to the call, and read back with
; a single v2.b64 load into a new register pair (REG2/REG3).
; CHECK: { // callseq 0, 0
; CHECK: .param .align 16 .b8 retval0[16];
; CHECK: call.uni (retval0),
; CHECK: ld.param.v2.b64 {%[[REG2:rd[0-9]+]], %[[REG3:rd[0-9]+]]}, [retval0+0];
; CHECK: } // callseq 0
%a = call i128 @callee(i128 %0)

; The i128 store is expected to be split into two 64-bit stores, at byte
; offsets 0 and 8 from the pointer, in either order.
; CHECK-DAG: st.u64 [%[[OUT]]], %[[REG2]];
; CHECK-DAG: st.u64 [%[[OUT]]+8], %[[REG3]];
store i128 %a, i128* %1

ret void
}

0 comments on commit d7a7382

Please sign in to comment.