Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 14 additions & 9 deletions clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ __global__ void kernel1(int *x) {
// CHECK-NEXT: store ptr addrspace(1) [[X_COERCE]], ptr [[X_ASCAST]], align 8
// CHECK-NEXT: [[X1:%.*]] = load ptr, ptr [[X_ASCAST]], align 8
// CHECK-NEXT: store ptr [[X1]], ptr [[X_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X_ADDR_ASCAST]], align 8, !nonnull [[META4:![0-9]+]], !align [[META5:![0-9]+]]
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
// CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1
// CHECK-NEXT: store i32 [[INC]], ptr [[TMP0]], align 4
Expand All @@ -111,7 +111,7 @@ __global__ void kernel1(int *x) {
// CHECK-SPIRV-NEXT: store ptr addrspace(1) [[X_COERCE]], ptr addrspace(4) [[X_ASCAST]], align 8
// CHECK-SPIRV-NEXT: [[X1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ASCAST]], align 8
// CHECK-SPIRV-NEXT: store ptr addrspace(4) [[X1]], ptr addrspace(4) [[X_ADDR_ASCAST]], align 8
// CHECK-SPIRV-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ADDR_ASCAST]], align 8
// CHECK-SPIRV-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ADDR_ASCAST]], align 8, !align [[META6:![0-9]+]]
// CHECK-SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[TMP0]], align 4
// CHECK-SPIRV-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1
// CHECK-SPIRV-NEXT: store i32 [[INC]], ptr addrspace(4) [[TMP0]], align 4
Expand Down Expand Up @@ -435,14 +435,15 @@ __global__ void kernel4(struct S s) {
// OPT-SAME: ptr addrspace(1) noundef readonly captures(none) [[S_COERCE:%.*]]) local_unnamed_addr #[[ATTR2]] {
// OPT-NEXT: [[ENTRY:.*:]]
// OPT-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspace(1) [[S_COERCE]], align 8
// OPT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
// OPT-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1
// OPT-NEXT: store i32 [[INC]], ptr [[TMP0]], align 4
// OPT-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(1)
// OPT-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) [[TMP1]], align 4
// OPT-NEXT: [[INC:%.*]] = add nsw i32 [[TMP2]], 1
// OPT-NEXT: store i32 [[INC]], ptr addrspace(1) [[TMP1]], align 4
// OPT-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(1) [[S_COERCE]], i64 8
// OPT-NEXT: [[TMP2:%.*]] = load ptr, ptr addrspace(1) [[Y]], align 8
// OPT-NEXT: [[TMP3:%.*]] = load float, ptr [[TMP2]], align 4
// OPT-NEXT: [[ADD:%.*]] = fadd contract float [[TMP3]], 1.000000e+00
// OPT-NEXT: store float [[ADD]], ptr [[TMP2]], align 4
// OPT-NEXT: [[TMP3:%.*]] = load ptr, ptr addrspace(1) [[Y]], align 8
// OPT-NEXT: [[TMP4:%.*]] = load float, ptr [[TMP3]], align 4
// OPT-NEXT: [[ADD:%.*]] = fadd contract float [[TMP4]], 1.000000e+00
// OPT-NEXT: store float [[ADD]], ptr [[TMP3]], align 4
// OPT-NEXT: ret void
//
// OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel5P1S(
Expand Down Expand Up @@ -727,7 +728,11 @@ __global__ void kernel8(struct SS a) {
*a.x += 3.f;
}
//.
// CHECK: [[META4]] = !{}
// CHECK: [[META5]] = !{i64 4}
//.
// CHECK-SPIRV: [[META5]] = !{i32 1024, i32 1, i32 1}
// CHECK-SPIRV: [[META6]] = !{i64 4}
//.
// OPT: [[META4]] = !{}
//.
Expand Down
6 changes: 6 additions & 0 deletions llvm/include/llvm/Analysis/TargetTransformInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -563,6 +563,8 @@ class TargetTransformInfo {

LLVM_ABI unsigned getAssumedAddrSpace(const Value *V) const;

/// Return the assumed address space of a memory access whose defining write
/// does not occur in the current function (a live-on-entry definition, in
/// the MemorySSA sense). Returns -1 if no assumption can be made.
LLVM_ABI unsigned getAssumedLiveOnEntryDefAddrSpace(const Value *V) const;

LLVM_ABI bool isSingleThreaded() const;

LLVM_ABI std::pair<const Value *, unsigned>
Expand All @@ -577,6 +579,10 @@ class TargetTransformInfo {
Value *OldV,
Value *NewV) const;

/// Return true if \p IID only performs an artificial clobber to facilitate
/// ordering constraints.
LLVM_ABI bool isArtificialClobber(Intrinsic::ID IID) const;

/// Test whether calls to a function lower to actual program function
/// calls.
///
Expand Down
6 changes: 6 additions & 0 deletions llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,10 @@ class TargetTransformInfoImplBase {

virtual unsigned getAssumedAddrSpace(const Value *V) const { return -1; }

/// Default implementation: no assumption can be made about the address
/// space of live-on-entry memory definitions (-1 == unknown).
virtual unsigned getAssumedLiveOnEntryDefAddrSpace(const Value *V) const {
  return -1;
}

virtual bool isSingleThreaded() const { return false; }

virtual std::pair<const Value *, unsigned>
Expand All @@ -171,6 +175,8 @@ class TargetTransformInfoImplBase {
return nullptr;
}

/// Default implementation: no intrinsic is treated as an artificial
/// (ordering-only) clobber.
virtual bool isArtificialClobber(Intrinsic::ID IID) const { return false; }

virtual bool isLoweredToCall(const Function *F) const {
assert(F && "A concrete function must be provided to this routine.");

Expand Down
6 changes: 6 additions & 0 deletions llvm/include/llvm/CodeGen/BasicTTIImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -440,6 +440,10 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
return getTLI()->getTargetMachine().getAssumedAddrSpace(V);
}

// Forward the query to the TargetMachine, which holds the target-specific
// assumption.
unsigned getAssumedLiveOnEntryDefAddrSpace(const Value *V) const override {
  return getTLI()->getTargetMachine().getAssumedLiveOnEntryDefAddrSpace(V);
}

bool isSingleThreaded() const override {
return getTLI()->getTargetMachine().Options.ThreadModel ==
ThreadModel::Single;
Expand All @@ -455,6 +459,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
return nullptr;
}

// Conservative default: targets override this to mark ordering-only
// clobber intrinsics.
bool isArtificialClobber(Intrinsic::ID IID) const override { return false; }

bool isLegalAddImmediate(int64_t imm) const override {
return getTLI()->isLegalAddImmediate(imm);
}
Expand Down
8 changes: 8 additions & 0 deletions llvm/include/llvm/Target/TargetMachine.h
Original file line number Diff line number Diff line change
Expand Up @@ -378,6 +378,14 @@ class LLVM_ABI TargetMachine {
/// properties.
virtual unsigned getAssumedAddrSpace(const Value *V) const { return -1; }

/// Return the assumed address space of memory accessed through a definition
/// that is live on entry to the function (the same notion as MemorySSA's
/// LiveOnEntryDef): loads and stores through pointer arguments and other
/// global values may be defined by memory operations that do not occur in
/// the current function. Returns -1 if no assumption can be made.
virtual unsigned getAssumedLiveOnEntryDefAddrSpace(const Value *V) const {
  return -1;
}

/// If the specified predicate checks whether a generic pointer falls within
/// a specified address space, return that generic pointer and the address
/// space being queried.
Expand Down
9 changes: 9 additions & 0 deletions llvm/lib/Analysis/TargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -339,6 +339,11 @@ unsigned TargetTransformInfo::getAssumedAddrSpace(const Value *V) const {
return TTIImpl->getAssumedAddrSpace(V);
}

// Forward to the target's TTI implementation.
unsigned
TargetTransformInfo::getAssumedLiveOnEntryDefAddrSpace(const Value *V) const {
  return TTIImpl->getAssumedLiveOnEntryDefAddrSpace(V);
}

bool TargetTransformInfo::isSingleThreaded() const {
return TTIImpl->isSingleThreaded();
}
Expand All @@ -353,6 +358,10 @@ Value *TargetTransformInfo::rewriteIntrinsicWithAddressSpace(
return TTIImpl->rewriteIntrinsicWithAddressSpace(II, OldV, NewV);
}

// Forward to the target's TTI implementation.
bool TargetTransformInfo::isArtificialClobber(Intrinsic::ID IID) const {
  return TTIImpl->isArtificialClobber(IID);
}

bool TargetTransformInfo::isLoweredToCall(const Function *F) const {
return TTIImpl->isLoweredToCall(F);
}
Expand Down
37 changes: 1 addition & 36 deletions llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,6 @@ class AMDGPUPromoteKernelArguments : public FunctionPass {

AliasAnalysis *AA;

Instruction *ArgCastInsertPt;

SmallVector<Value *> Ptrs;

void enqueueUsers(Value *Ptr);
Expand Down Expand Up @@ -107,24 +105,7 @@ bool AMDGPUPromoteKernelArguments::promotePointer(Value *Ptr) {
PT->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS)
enqueueUsers(Ptr);

if (PT->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
return Changed;

IRBuilder<> B(LI ? &*std::next(cast<Instruction>(Ptr)->getIterator())
: ArgCastInsertPt);

// Cast pointer to global address space and back to flat and let
// Infer Address Spaces pass to do all necessary rewriting.
PointerType *NewPT =
PointerType::get(PT->getContext(), AMDGPUAS::GLOBAL_ADDRESS);
Value *Cast =
B.CreateAddrSpaceCast(Ptr, NewPT, Twine(Ptr->getName(), ".global"));
Value *CastBack =
B.CreateAddrSpaceCast(Cast, PT, Twine(Ptr->getName(), ".flat"));
Ptr->replaceUsesWithIf(CastBack,
[Cast](Use &U) { return U.getUser() != Cast; });

return true;
return Changed;
}

bool AMDGPUPromoteKernelArguments::promoteLoad(LoadInst *LI) {
Expand All @@ -135,21 +116,6 @@ bool AMDGPUPromoteKernelArguments::promoteLoad(LoadInst *LI) {
return true;
}

// skip allocas
static BasicBlock::iterator getInsertPt(BasicBlock &BB) {
BasicBlock::iterator InsPt = BB.getFirstInsertionPt();
for (BasicBlock::iterator E = BB.end(); InsPt != E; ++InsPt) {
AllocaInst *AI = dyn_cast<AllocaInst>(&*InsPt);

// If this is a dynamic alloca, the value may depend on the loaded kernargs,
// so loads will need to be inserted before it.
if (!AI || !AI->isStaticAlloca())
break;
}

return InsPt;
}

bool AMDGPUPromoteKernelArguments::run(Function &F, MemorySSA &MSSA,
AliasAnalysis &AA) {
if (skipFunction(F))
Expand All @@ -159,7 +125,6 @@ bool AMDGPUPromoteKernelArguments::run(Function &F, MemorySSA &MSSA,
if (CC != CallingConv::AMDGPU_KERNEL || F.arg_empty())
return false;

ArgCastInsertPt = &*getInsertPt(*F.begin());
this->MSSA = &MSSA;
this->AA = &AA;

Expand Down
31 changes: 28 additions & 3 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1042,13 +1042,38 @@ unsigned AMDGPUTargetMachine::getAssumedAddrSpace(const Value *V) const {
assert(V->getType()->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS);

const auto *Ptr = LD->getPointerOperand();
if (Ptr->getType()->getPointerAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;

// For a generic pointer loaded from the constant memory, it could be assumed
// as a global pointer since the constant memory is only populated on the
// host side. As implied by the offload programming model, only global
// pointers could be referenced on the host side.
return AMDGPUAS::GLOBAL_ADDRESS;
if (Ptr->getType()->getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS)
return AMDGPUAS::GLOBAL_ADDRESS;

// Similarly, a generic pointer loaded through a readonly, noalias kernel
// argument can be assumed to be a global pointer.
if (const Argument *Arg = dyn_cast<Argument>(getUnderlyingObject(Ptr)))
if (AMDGPU::isModuleEntryFunctionCC(Arg->getParent()->getCallingConv()) &&
Arg->onlyReadsMemory() && Arg->hasNoAliasAttr())
return AMDGPUAS::GLOBAL_ADDRESS;

return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
}

/// Return the address space assumed for memory whose definition is live on
/// entry to the function (see TargetMachine::getAssumedLiveOnEntryDefAddrSpace).
/// Returns AMDGPUAS::UNKNOWN_ADDRESS_SPACE when no assumption can be made.
unsigned
AMDGPUTargetMachine::getAssumedLiveOnEntryDefAddrSpace(const Value *V) const {
  // In a module entry function, memory that is live on entry was populated
  // by the host, which can only reference global memory.
  if (const Instruction *I = dyn_cast<Instruction>(V))
    if (AMDGPU::isModuleEntryFunctionCC(I->getFunction()->getCallingConv()))
      return AMDGPUAS::GLOBAL_ADDRESS;

  // A generic pointer loaded from constant memory can be assumed to be a
  // global pointer; same reasoning as in getAssumedAddrSpace.
  if (const LoadInst *LD = dyn_cast<LoadInst>(V))
    if (LD->getPointerOperandType()->getPointerAddressSpace() ==
        AMDGPUAS::CONSTANT_ADDRESS)
      return AMDGPUAS::GLOBAL_ADDRESS;

  return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
}

std::pair<const Value *, unsigned>
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,8 @@ class AMDGPUTargetMachine : public CodeGenTargetMachineImpl {

unsigned getAssumedAddrSpace(const Value *V) const override;

unsigned getAssumedLiveOnEntryDefAddrSpace(const Value *V) const override;

std::pair<const Value *, unsigned>
getPredicatedAddrSpace(const Value *V) const override;

Expand Down
22 changes: 22 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1223,6 +1223,28 @@ Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
}
}

// Return true if \p IID is modeled as clobbering memory only to impose
// ordering constraints (barriers and scheduling intrinsics); such intrinsics
// do not themselves modify memory.
bool GCNTTIImpl::isArtificialClobber(Intrinsic::ID IID) const {
  switch (IID) {
  case Intrinsic::amdgcn_s_barrier:
  case Intrinsic::amdgcn_s_cluster_barrier:
  case Intrinsic::amdgcn_s_barrier_signal:
  case Intrinsic::amdgcn_s_barrier_signal_var:
  case Intrinsic::amdgcn_s_barrier_signal_isfirst:
  case Intrinsic::amdgcn_s_barrier_init:
  case Intrinsic::amdgcn_s_barrier_join:
  case Intrinsic::amdgcn_s_barrier_wait:
  case Intrinsic::amdgcn_s_barrier_leave:
  case Intrinsic::amdgcn_s_get_barrier_state:
  case Intrinsic::amdgcn_wave_barrier:
  case Intrinsic::amdgcn_sched_barrier:
  case Intrinsic::amdgcn_sched_group_barrier:
  case Intrinsic::amdgcn_iglp_opt:
    return true;
  default:
    return false;
  }
}

InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
VectorType *DstTy, VectorType *SrcTy,
ArrayRef<int> Mask,
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,8 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
Value *NewV) const override;

bool isArtificialClobber(Intrinsic::ID IID) const override;

bool canSimplifyLegacyMulToMul(const Instruction &I, const Value *Op0,
const Value *Op1, InstCombiner &IC) const;

Expand Down
48 changes: 48 additions & 0 deletions llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -592,6 +592,32 @@ Value *NVPTXTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
return nullptr;
}

// Return true if \p IID is modeled as clobbering memory only to impose
// ordering constraints (barrier and membar intrinsics); such intrinsics do
// not themselves write memory.
bool NVPTXTTIImpl::isArtificialClobber(Intrinsic::ID IID) const {
  switch (IID) {
  case Intrinsic::nvvm_bar_warp_sync:
  case Intrinsic::nvvm_barrier_cluster_arrive:
  case Intrinsic::nvvm_barrier_cluster_arrive_aligned:
  case Intrinsic::nvvm_barrier_cluster_arrive_relaxed:
  case Intrinsic::nvvm_barrier_cluster_arrive_relaxed_aligned:
  case Intrinsic::nvvm_barrier_cluster_wait:
  case Intrinsic::nvvm_barrier_cluster_wait_aligned:
  case Intrinsic::nvvm_barrier_cta_arrive_aligned_count:
  case Intrinsic::nvvm_barrier_cta_arrive_count:
  case Intrinsic::nvvm_barrier_cta_sync_aligned_all:
  case Intrinsic::nvvm_barrier_cta_sync_aligned_count:
  case Intrinsic::nvvm_barrier_cta_sync_all:
  case Intrinsic::nvvm_barrier_cta_sync_count:
  case Intrinsic::nvvm_barrier0_and:
  case Intrinsic::nvvm_barrier0_or:
  case Intrinsic::nvvm_barrier0_popc:
  case Intrinsic::nvvm_membar_cta:
  case Intrinsic::nvvm_membar_gl:
  case Intrinsic::nvvm_membar_sys:
    return true;
  default:
    return false;
  }
}

bool NVPTXTTIImpl::isLegalMaskedStore(Type *DataTy, Align Alignment,
unsigned AddrSpace,
TTI::MaskKind MaskKind) const {
Expand Down Expand Up @@ -657,6 +683,28 @@ unsigned NVPTXTTIImpl::getAssumedAddrSpace(const Value *V) const {
}
}

if (const auto *LD = dyn_cast<LoadInst>(V)) {
// The loaded value must be a generic pointer here.
assert(V->getType()->getPointerAddressSpace() == ADDRESS_SPACE_GENERIC);

// For a generic pointer loaded from the readonly and noalias arg, it could
// be assumed as a global pointer since the readonly memory is only
// populated on the host side.
if (const Argument *Arg =
dyn_cast<Argument>(getUnderlyingObject(LD->getPointerOperand())))
if (isKernelFunction(*Arg->getParent()) && Arg->onlyReadsMemory() &&
Arg->hasNoAliasAttr())
return ADDRESS_SPACE_GLOBAL;
}
return -1;
}

/// Return the address space assumed for memory whose definition is live on
/// entry to the function. In a kernel, such memory was populated by the
/// host, which can only reference global memory. Returns -1 (unknown)
/// otherwise.
unsigned NVPTXTTIImpl::getAssumedLiveOnEntryDefAddrSpace(const Value *V) const {
  if (const Instruction *I = dyn_cast<Instruction>(V))
    if (isKernelFunction(*I->getFunction()))
      return ADDRESS_SPACE_GLOBAL;
  return -1;
}

Expand Down
5 changes: 5 additions & 0 deletions llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -191,8 +191,13 @@ class NVPTXTTIImpl final : public BasicTTIImplBase<NVPTXTTIImpl> {

Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
Value *NewV) const override;

bool isArtificialClobber(Intrinsic::ID IID) const override;

unsigned getAssumedAddrSpace(const Value *V) const override;

unsigned getAssumedLiveOnEntryDefAddrSpace(const Value *V) const override;

void collectKernelLaunchBounds(
const Function &F,
SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const override;
Expand Down
Loading
Loading