Revert "[NVPTX] Enhance vectorization of ld.param & st.param"
This reverts commit f854434.

The original commit message linked to the wrong differential revision.
kovdan01 committed Mar 24, 2022
1 parent 3642baf commit a034878
Showing 7 changed files with 55 additions and 1,544 deletions.
clang/test/CodeGenCUDA/device-fun-linkage.cu (31 changes: 9 additions & 22 deletions)
@@ -1,32 +1,19 @@
// RUN: %clang_cc1 -triple nvptx -fcuda-is-device -emit-llvm -o - %s \
// RUN: %clang_cc1 -triple nvptx -fcuda-is-device \
// RUN: -emit-llvm -o - %s \
// RUN: | FileCheck -check-prefix=NORDC %s
// RUN: %clang_cc1 -triple nvptx -fcuda-is-device -emit-llvm -o - %s \
// RUN: | FileCheck -check-prefix=NORDC-NEG %s
// RUN: %clang_cc1 -triple nvptx -fcuda-is-device -fgpu-rdc -emit-llvm -o - %s \
// RUN: %clang_cc1 -triple nvptx -fcuda-is-device \
// RUN: -fgpu-rdc -emit-llvm -o - %s \
// RUN: | FileCheck -check-prefix=RDC %s
// RUN: %clang_cc1 -triple nvptx -fcuda-is-device -fgpu-rdc -emit-llvm -o - %s \
// RUN: | FileCheck -check-prefix=RDC-NEG %s

#include "Inputs/cuda.h"

// NORDC: define internal void @_Z4funcIiEvv()
// NORDC: define{{.*}} void @_Z6kernelIiEvv()
// RDC: define weak_odr void @_Z4funcIiEvv()
// RDC: define weak_odr void @_Z6kernelIiEvv()

template <typename T> __device__ void func() {}
template <typename T> __global__ void kernel() {}

template __device__ void func<int>();
// NORDC: define internal void @_Z4funcIiEvv()
// RDC: define weak_odr void @_Z4funcIiEvv()

template __global__ void kernel<int>();
// NORDC: define void @_Z6kernelIiEvv()
// RDC: define weak_odr void @_Z6kernelIiEvv()

// Ensure that unused static device function is eliminated
static __device__ void static_func() {}
// NORDC-NEG-NOT: define{{.*}} void @_ZL13static_funcv()
// RDC-NEG-NOT: define{{.*}} void @_ZL13static_funcv()

// Ensure that kernel function has external or weak_odr
// linkage regardless of the static specifier
static __global__ void static_kernel() {}
// NORDC: define void @_ZL13static_kernelv()
// RDC: define weak_odr void @_ZL13static_kernelv()
llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp (38 changes: 15 additions & 23 deletions)
@@ -329,7 +329,7 @@ MCOperand NVPTXAsmPrinter::GetSymbolRef(const MCSymbol *Symbol) {
void NVPTXAsmPrinter::printReturnValStr(const Function *F, raw_ostream &O) {
const DataLayout &DL = getDataLayout();
const NVPTXSubtarget &STI = TM.getSubtarget<NVPTXSubtarget>(*F);
const auto *TLI = cast<NVPTXTargetLowering>(STI.getTargetLowering());
const TargetLowering *TLI = STI.getTargetLowering();

Type *Ty = F->getReturnType();

@@ -363,7 +363,7 @@ void NVPTXAsmPrinter::printReturnValStr(const Function *F, raw_ostream &O) {
unsigned totalsz = DL.getTypeAllocSize(Ty);
unsigned retAlignment = 0;
if (!getAlign(*F, 0, retAlignment))
retAlignment = TLI->getFunctionParamOptimizedAlign(F, Ty, DL).value();
retAlignment = DL.getABITypeAlignment(Ty);
O << ".param .align " << retAlignment << " .b8 func_retval0[" << totalsz
<< "]";
} else
@@ -1348,8 +1348,7 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
const DataLayout &DL = getDataLayout();
const AttributeList &PAL = F->getAttributes();
const NVPTXSubtarget &STI = TM.getSubtarget<NVPTXSubtarget>(*F);
const auto *TLI = cast<NVPTXTargetLowering>(STI.getTargetLowering());

const TargetLowering *TLI = STI.getTargetLowering();
Function::const_arg_iterator I, E;
unsigned paramIndex = 0;
bool first = true;
@@ -1406,24 +1405,18 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
}
}

auto getOptimalAlignForParam = [TLI, &DL, &PAL, F,
paramIndex](Type *Ty) -> Align {
Align TypeAlign = TLI->getFunctionParamOptimizedAlign(F, Ty, DL);
MaybeAlign ParamAlign = PAL.getParamAlignment(paramIndex);
return max(TypeAlign, ParamAlign);
};

if (!PAL.hasParamAttr(paramIndex, Attribute::ByVal)) {
if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) {
// Just print .param .align <a> .b8 .param[size];
// <a> = optimal alignment for the element type; always multiple of
// PAL.getParamAlignment
// <a> = PAL.getParamAlignment
// size = typeallocsize of element type
Align OptimalAlign = getOptimalAlignForParam(Ty);
const Align align = DL.getValueOrABITypeAlignment(
PAL.getParamAlignment(paramIndex), Ty);

O << "\t.param .align " << OptimalAlign.value() << " .b8 ";
unsigned sz = DL.getTypeAllocSize(Ty);
O << "\t.param .align " << align.value() << " .b8 ";
printParamName(I, paramIndex, O);
O << "[" << DL.getTypeAllocSize(Ty) << "]";
O << "[" << sz << "]";

continue;
}
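For orientation, a minimal stand-alone C++ sketch of the emission pattern this hunk restores; the helper name and plain-integer arguments are illustrative stand-ins for printParamName and the DataLayout queries, not LLVM's API:

    #include <cstdint>
    #include <iostream>

    // Mirrors the restored path: .param .align <a> .b8 <name>[<size>],
    // where <a> comes from PAL.getParamAlignment (or the ABI type
    // alignment) and <size> from DL.getTypeAllocSize of the aggregate.
    void printAggregateParam(std::ostream &O, const char *Name,
                             uint64_t AlignBytes, uint64_t SizeBytes) {
      O << "\t.param .align " << AlignBytes << " .b8 " << Name << "["
        << SizeBytes << "]";
    }

    int main() {
      // e.g. struct { double a, b; }: ABI alignment 8, alloc size 16
      printAggregateParam(std::cout, "foo_param_0", 8, 16);
      std::cout << "\n";
    }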
@@ -1499,11 +1492,10 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {

if (isABI || isKernelFunc) {
// Just print .param .align <a> .b8 .param[size];
// <a> = optimal alignment for the element type; always multiple of
// PAL.getParamAlignment
// <a> = PAL.getParamAlignment
// size = typeallocsize of element type
Align OptimalAlign = getOptimalAlignForParam(ETy);

Align align =
DL.getValueOrABITypeAlignment(PAL.getParamAlignment(paramIndex), ETy);
// Work around a bug in ptxas. When PTX code takes address of
// byval parameter with alignment < 4, ptxas generates code to
// spill argument into memory. Alas on sm_50+ ptxas generates
@@ -1515,10 +1507,10 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
// TODO: this will need to be undone when we get to support multi-TU
// device-side compilation as it breaks ABI compatibility with nvcc.
// Hopefully ptxas bug is fixed by then.
if (!isKernelFunc && OptimalAlign < Align(4))
OptimalAlign = Align(4);
if (!isKernelFunc && align < Align(4))
align = Align(4);
unsigned sz = DL.getTypeAllocSize(ETy);
O << "\t.param .align " << OptimalAlign.value() << " .b8 ";
O << "\t.param .align " << align.value() << " .b8 ";
printParamName(I, paramIndex, O);
O << "[" << sz << "]";
continue;
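The sm_50+ workaround, present in both the removed and the restored code, reduces to a small clamp; a hedged sketch, with a plain byte count standing in for llvm::Align:

    #include <cstdint>

    // Byval parameters of non-kernel functions get at least .align 4 to
    // avoid the ptxas spill miscompile described in the comment above;
    // kernel parameters keep their declared alignment.
    uint64_t byvalParamAlign(bool IsKernelFunc, uint64_t DeclaredAlign) {
      if (!IsKernelFunc && DeclaredAlign < 4)
        return 4;
      return DeclaredAlign;
    }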
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp (166 changes: 30 additions & 136 deletions)
@@ -1302,8 +1302,8 @@ std::string NVPTXTargetLowering::getPrototype(

bool first = true;

const Function *F = CB.getFunction();
for (unsigned i = 0, e = Args.size(), OIdx = 0; i != e; ++i, ++OIdx) {
unsigned OIdx = 0;
for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
Type *Ty = Args[i].Ty;
if (!first) {
O << ", ";
@@ -1312,14 +1312,15 @@

if (!Outs[OIdx].Flags.isByVal()) {
if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) {
unsigned ParamAlign = 0;
unsigned align = 0;
const CallInst *CallI = cast<CallInst>(&CB);
// +1 because index 0 is reserved for return type alignment
if (!getAlign(*CallI, i + 1, ParamAlign))
ParamAlign = getFunctionParamOptimizedAlign(F, Ty, DL).value();
O << ".param .align " << ParamAlign << " .b8 ";
if (!getAlign(*CallI, i + 1, align))
align = DL.getABITypeAlignment(Ty);
unsigned sz = DL.getTypeAllocSize(Ty);
O << ".param .align " << align << " .b8 ";
O << "_";
O << "[" << DL.getTypeAllocSize(Ty) << "]";
O << "[" << sz << "]";
// update the index for Outs
SmallVector<EVT, 16> vtparts;
ComputeValueVTs(*this, DL, Ty, vtparts);
@@ -1351,17 +1352,11 @@ std::string NVPTXTargetLowering::getPrototype(
continue;
}

Align ParamByValAlign = Outs[OIdx].Flags.getNonZeroByValAlign();

// Try to increase alignment. This code matches logic in LowerCall when
// alignment increase is performed to increase vectorization options.
Type *ETy = Args[i].IndirectType;
Align AlignCandidate = getFunctionParamOptimizedAlign(F, ETy, DL);
ParamByValAlign = std::max(ParamByValAlign, AlignCandidate);

O << ".param .align " << ParamByValAlign.value() << " .b8 ";
Align align = Outs[OIdx].Flags.getNonZeroByValAlign();
unsigned sz = Outs[OIdx].Flags.getByValSize();
O << ".param .align " << align.value() << " .b8 ";
O << "_";
O << "[" << Outs[OIdx].Flags.getByValSize() << "]";
O << "[" << sz << "]";
}
O << ");";
return O.str();
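To make the string being assembled here concrete, a rough model of the parameter portion of the prototype; the struct and helper are hypothetical, and only the ".param .align <a> .b8 _[<size>]" shape is taken from the code above (scalar arguments use a different form not modeled here):

    #include <sstream>
    #include <string>
    #include <vector>

    struct ProtoParam { unsigned AlignBytes; unsigned SizeBytes; };

    // Each aggregate or byval argument contributes one bracketed entry.
    std::string buildParamList(const std::vector<ProtoParam> &Params) {
      std::ostringstream O;
      bool First = true;
      for (const ProtoParam &P : Params) {
        if (!First)
          O << ", ";
        First = false;
        O << ".param .align " << P.AlignBytes << " .b8 _[" << P.SizeBytes
          << "]";
      }
      return O.str();
    }

    // buildParamList({{8, 16}}) yields ".param .align 8 .b8 _[16]"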
@@ -1408,15 +1403,12 @@ Align NVPTXTargetLowering::getArgumentAlignment(SDValue Callee,

// Check for function alignment information if we found that the
// ultimate target is a Function
if (DirectCallee) {
if (DirectCallee)
if (getAlign(*DirectCallee, Idx, Alignment))
return Align(Alignment);
// If alignment information is not available, fall back to the
// default function param optimized type alignment
return getFunctionParamOptimizedAlign(DirectCallee, Ty, DL);
}

// Call is indirect, fall back to the ABI type alignment
// Call is indirect or alignment information is not available, fall back to
// the ABI type alignment
return DL.getABITypeAlign(Ty);
}
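The restored decision procedure in getArgumentAlignment condenses to one branch; a sketch, with std::optional standing in for the getAlign(...) success flag:

    #include <cstdint>
    #include <optional>

    // Alignment metadata on a direct callee wins; an indirect call, or a
    // direct call without metadata, falls back to the ABI type alignment.
    uint64_t argumentAlign(std::optional<uint64_t> DirectCalleeAlign,
                           uint64_t ABITypeAlign) {
      return DirectCalleeAlign.value_or(ABITypeAlign);
    }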

@@ -1577,26 +1569,18 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
}

// ByVal arguments
// TODO: remove code duplication when handling byval and non-byval cases.
SmallVector<EVT, 16> VTs;
SmallVector<uint64_t, 16> Offsets;
Type *ETy = Args[i].IndirectType;
assert(ETy && "byval arg must have indirect type");
ComputePTXValueVTs(*this, DL, ETy, VTs, &Offsets, 0);
assert(Args[i].IndirectType && "byval arg must have indirect type");
ComputePTXValueVTs(*this, DL, Args[i].IndirectType, VTs, &Offsets, 0);

// declare .param .align <align> .b8 .param<n>[<size>];
unsigned sz = Outs[OIdx].Flags.getByValSize();
SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);

Align ArgAlign = Outs[OIdx].Flags.getNonZeroByValAlign();
// The ByValAlign in the Outs[OIdx].Flags is always set at this point,
// so we don't need to worry about natural alignment or not.
// See TargetLowering::LowerCallTo().
Align ArgAlign = Outs[OIdx].Flags.getNonZeroByValAlign();

// Try to increase alignment to enhance vectorization options.
const Function *F = CB->getCalledFunction();
Align AlignCandidate = getFunctionParamOptimizedAlign(F, ETy, DL);
ArgAlign = std::max(ArgAlign, AlignCandidate);

// Enforce minimum alignment of 4 to work around ptxas miscompile
// for sm_50+. See corresponding alignment adjustment in
@@ -1610,67 +1594,29 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
DeclareParamOps);
InFlag = Chain.getValue(1);

auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign);
SmallVector<SDValue, 6> StoreOperands;
for (unsigned j = 0, je = VTs.size(); j != je; ++j) {
EVT elemtype = VTs[j];
int curOffset = Offsets[j];
Align PartAlign = commonAlignment(ArgAlign, curOffset);

// New store.
if (VectorInfo[j] & PVF_FIRST) {
assert(StoreOperands.empty() && "Unfinished preceding store.");
StoreOperands.push_back(Chain);
StoreOperands.push_back(DAG.getConstant(paramCount, dl, MVT::i32));
StoreOperands.push_back(DAG.getConstant(curOffset, dl, MVT::i32));
}

unsigned PartAlign = GreatestCommonDivisor64(ArgAlign.value(), curOffset);
auto PtrVT = getPointerTy(DL);
SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, OutVals[OIdx],
DAG.getConstant(curOffset, dl, PtrVT));
SDValue theVal = DAG.getLoad(elemtype, dl, tempChain, srcAddr,
MachinePointerInfo(), PartAlign);

if (elemtype.getSizeInBits() < 16) {
// Use 16-bit registers for small stores as it's the
// smallest general purpose register size supported by NVPTX.
theVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, theVal);
}
SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
SDValue CopyParamOps[] = { Chain,
DAG.getConstant(paramCount, dl, MVT::i32),
DAG.getConstant(curOffset, dl, MVT::i32),
theVal, InFlag };
Chain = DAG.getMemIntrinsicNode(
NVPTXISD::StoreParam, dl, CopyParamVTs, CopyParamOps, elemtype,
MachinePointerInfo(), /* Align */ None, MachineMemOperand::MOStore);

// Record the value to store.
StoreOperands.push_back(theVal);

if (VectorInfo[j] & PVF_LAST) {
unsigned NumElts = StoreOperands.size() - 3;
NVPTXISD::NodeType Op;
switch (NumElts) {
case 1:
Op = NVPTXISD::StoreParam;
break;
case 2:
Op = NVPTXISD::StoreParamV2;
break;
case 4:
Op = NVPTXISD::StoreParamV4;
break;
default:
llvm_unreachable("Invalid vector info.");
}

StoreOperands.push_back(InFlag);

Chain = DAG.getMemIntrinsicNode(
Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands,
elemtype, MachinePointerInfo(), PartAlign,
MachineMemOperand::MOStore);
InFlag = Chain.getValue(1);

// Cleanup.
StoreOperands.clear();
}
InFlag = Chain.getValue(1);
}
assert(StoreOperands.empty() && "Unfinished parameter store.");
++paramCount;
}
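The two store strategies this hunk toggles between can be modeled with plain ints in place of SDValues; a toy sketch, not the SelectionDAG code: the removed version groups elements into st.param.v2/v4 runs delimited by PVF_FIRST/PVF_LAST, while the restored version behaves as if every element carried both flags (one st.param each).

    #include <cassert>
    #include <cstdio>
    #include <vector>

    enum { PVF_FIRST = 1, PVF_LAST = 2 };

    void emitStores(const std::vector<int> &Vals,
                    const std::vector<unsigned> &VectorInfo) {
      std::vector<int> StoreOperands;
      for (size_t j = 0; j < Vals.size(); ++j) {
        if (VectorInfo[j] & PVF_FIRST)
          assert(StoreOperands.empty() && "Unfinished preceding store.");
        StoreOperands.push_back(Vals[j]);   // record the value to store
        if (VectorInfo[j] & PVF_LAST) {     // close the group
          std::printf("st.param.v%zu\n", StoreOperands.size()); // 1, 2 or 4
          StoreOperands.clear();
        }
      }
      assert(StoreOperands.empty() && "Unfinished parameter store.");
    }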

@@ -2671,8 +2617,7 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SDLoc &dl, SelectionDAG &DAG) const {
const MachineFunction &MF = DAG.getMachineFunction();
const Function &F = MF.getFunction();
MachineFunction &MF = DAG.getMachineFunction();
Type *RetTy = MF.getFunction().getReturnType();

bool isABI = (STI.getSmVersion() >= 20);
@@ -2687,9 +2632,7 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
assert(VTs.size() == OutVals.size() && "Bad return value decomposition");

auto VectorInfo = VectorizePTXValueVTs(
VTs, Offsets,
RetTy->isSized() ? getFunctionParamOptimizedAlign(&F, RetTy, DL)
: Align(1));
VTs, Offsets, RetTy->isSized() ? DL.getABITypeAlign(RetTy) : Align(1));

// PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
// 32-bits are sign extended or zero extended, depending on whether
@@ -4309,55 +4252,6 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
return false;
}

/// getFunctionParamOptimizedAlign - since function arguments are passed via
/// .param space, we may want to increase their alignment in a way that
/// ensures that we can effectively vectorize their loads & stores. We can
/// increase alignment only if the function has internal or has private
/// linkage as for other linkage types callers may already rely on default
/// alignment. To allow using 128-bit vectorized loads/stores, this function
/// ensures that alignment is 16 or greater.
Align NVPTXTargetLowering::getFunctionParamOptimizedAlign(
const Function *F, Type *ArgTy, const DataLayout &DL) const {
const uint64_t ABITypeAlign = DL.getABITypeAlign(ArgTy).value();

// If a function has linkage different from internal or private, we
// must use default ABI alignment as external users rely on it.
switch (F->getLinkage()) {
case GlobalValue::InternalLinkage:
case GlobalValue::PrivateLinkage: {
// Check that if a function has internal or private linkage
// it is not a kernel.
#ifndef NDEBUG
const NamedMDNode *NMDN =
F->getParent()->getNamedMetadata("nvvm.annotations");
if (NMDN) {
for (const MDNode *MDN : NMDN->operands()) {
assert(MDN->getNumOperands() == 3);

const Metadata *MD0 = MDN->getOperand(0).get();
const auto *MDV0 = cast<ConstantAsMetadata>(MD0)->getValue();
const auto *MDFn = cast<Function>(MDV0);
if (MDFn != F)
continue;

const Metadata *MD1 = MDN->getOperand(1).get();
const MDString *MDStr = cast<MDString>(MD1);
if (MDStr->getString() != "kernel")
continue;

const Metadata *MD2 = MDN->getOperand(2).get();
const auto *MDV2 = cast<ConstantAsMetadata>(MD2)->getValue();
assert(!cast<ConstantInt>(MDV2)->isZero());
}
}
#endif
return Align(std::max(uint64_t(16), ABITypeAlign));
}
default:
return Align(ABITypeAlign);
}
}
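The policy of the deleted getFunctionParamOptimizedAlign fits in one expression; a condensed restatement under the same assumptions as the doc comment above:

    #include <algorithm>
    #include <cstdint>

    // Only internal/private-linkage functions may have their parameter
    // alignment raised (external callers rely on the ABI value); 16 bytes
    // is enough for 128-bit vectorized ld.param/st.param.
    uint64_t paramOptimizedAlign(bool HasLocalLinkage,
                                 uint64_t ABITypeAlign) {
      return HasLocalLinkage ? std::max<uint64_t>(16, ABITypeAlign)
                             : ABITypeAlign;
    }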

/// isLegalAddressingMode - Return true if the addressing mode represented
/// by AM is legal for this target, for a load/store of the specified type.
/// Used to guide target specific optimizations, like loop strength reduction
