[AMDGPU] Promote recursive loads from kernel argument to constant
Pointer load chains that are not clobbered are now promoted to the
global address space. That makes it possible to promote these loads
themselves into the constant address space. The loaded pointers still
need to point to global memory, because we must be able to store
through them and because an actual load through them may occur after
a clobber.

Differential Revision: https://reviews.llvm.org/D119886
rampitec committed Feb 17, 2022
1 parent 1af15de commit b0aa194
Showing 3 changed files with 273 additions and 58 deletions.
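To make the effect concrete before the diffs, here is a minimal sketch in LLVM IR. It is hypothetical (the kernel and value names are illustrative, chosen to mirror the updated kernel1 test expectations below), not output copied from the compiler:

; Before: the load reads through the global (addrspace(1)) kernel argument.
define amdgpu_kernel void @inc_before(i32 addrspace(1)* %x.coerce) {
entry:
  %v = load i32, i32 addrspace(1)* %x.coerce, align 4
  %inc = add nsw i32 %v, 1
  store i32 %inc, i32 addrspace(1)* %x.coerce, align 4
  ret void
}

; After: the load goes through a constant (addrspace(4)) view of the same
; pointer, while the store still targets addrspace(1), since constant
; memory is read-only.
define amdgpu_kernel void @inc_after(i32 addrspace(1)* %x.coerce) {
entry:
  %x.coerce.const = addrspacecast i32 addrspace(1)* %x.coerce to i32 addrspace(4)*
  %v = load i32, i32 addrspace(4)* %x.coerce.const, align 4
  %inc = add nsw i32 %v, 1
  store i32 %inc, i32 addrspace(1)* %x.coerce, align 4
  ret void
}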
12 changes: 7 additions & 5 deletions clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu
@@ -18,7 +18,7 @@
 // COMMON-LABEL: define{{.*}} amdgpu_kernel void @_Z7kernel1Pi(i32 addrspace(1)*{{.*}} %x.coerce)
 // CHECK: ={{.*}} addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]*
 // CHECK-NOT: ={{.*}} addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]*
-// OPT: [[VAL:%.*]] = load i32, i32 addrspace(1)* %x.coerce, align 4
+// OPT: [[VAL:%.*]] = load i32, i32 addrspace(4)* %x.coerce.const, align 4
 // OPT: [[INC:%.*]] = add nsw i32 [[VAL]], 1
 // OPT: store i32 [[INC]], i32 addrspace(1)* %x.coerce, align 4
 // OPT: ret void
@@ -30,7 +30,7 @@ __global__ void kernel1(int *x) {
 // COMMON-LABEL: define{{.*}} amdgpu_kernel void @_Z7kernel2Ri(i32 addrspace(1)*{{.*}} nonnull align 4 dereferenceable(4) %x.coerce)
 // CHECK: ={{.*}} addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]*
 // CHECK-NOT: ={{.*}} addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]*
-// OPT: [[VAL:%.*]] = load i32, i32 addrspace(1)* %x.coerce, align 4
+// OPT: [[VAL:%.*]] = load i32, i32 addrspace(4)* %x.coerce.const, align 4
 // OPT: [[INC:%.*]] = add nsw i32 [[VAL]], 1
 // OPT: store i32 [[INC]], i32 addrspace(1)* %x.coerce, align 4
 // OPT: ret void
@@ -68,7 +68,8 @@ struct S {
 // OPT: [[R1:%.*]] = getelementptr inbounds %struct.S, %struct.S addrspace(4)* %0, i64 0, i32 1
 // OPT: [[P1:%.*]] = load float*, float* addrspace(4)* [[R1]], align 8
 // OPT: [[G1:%.*]] ={{.*}} addrspacecast float* [[P1]] to float addrspace(1)*
-// OPT: [[V0:%.*]] = load i32, i32 addrspace(1)* [[G0]], align 4
+// OPT: [[G2:%.*]] ={{.*}} addrspacecast i32* [[P0]] to i32 addrspace(4)*
+// OPT: [[V0:%.*]] = load i32, i32 addrspace(4)* [[G2]], align 4
 // OPT: [[INC:%.*]] = add nsw i32 [[V0]], 1
 // OPT: store i32 [[INC]], i32 addrspace(1)* [[G0]], align 4
 // OPT: [[V1:%.*]] = load float, float addrspace(1)* [[G1]], align 4
@@ -103,7 +104,8 @@ struct T {
 // OPT: [[R1:%.*]] = getelementptr inbounds %struct.T, %struct.T addrspace(4)* %0, i64 0, i32 0, i64 1
 // OPT: [[P1:%.*]] = load float*, float* addrspace(4)* [[R1]], align 8
 // OPT: [[G1:%.*]] ={{.*}} addrspacecast float* [[P1]] to float addrspace(1)*
-// OPT: [[V0:%.*]] = load float, float addrspace(1)* [[G0]], align 4
+// OPT: [[G2:%.*]] ={{.*}} addrspacecast float* [[P0]] to float addrspace(4)*
+// OPT: [[V0:%.*]] = load float, float addrspace(4)* [[G2]], align 4
 // OPT: [[ADD0:%.*]] = fadd contract float [[V0]], 1.000000e+00
 // OPT: store float [[ADD0]], float addrspace(1)* [[G0]], align 4
 // OPT: [[V1:%.*]] = load float, float addrspace(1)* [[G1]], align 4
@@ -130,7 +132,7 @@ struct SS {
 // COMMON-LABEL: define{{.*}} amdgpu_kernel void @_Z7kernel82SS(float addrspace(1)*{{.*}} %a.coerce)
 // CHECK: ={{.*}} addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]*
 // CHECK-NOT: ={{.*}} addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]*
-// OPT: [[VAL:%.*]] = load float, float addrspace(1)* %a.coerce, align 4
+// OPT: [[VAL:%.*]] = load float, float addrspace(4)* %a.coerce.const, align 4
 // OPT: [[INC:%.*]] = fadd contract float [[VAL]], 3.000000e+00
 // OPT: store float [[INC]], float addrspace(1)* %a.coerce, align 4
 // OPT: ret void
69 changes: 54 additions & 15 deletions llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp
@@ -42,6 +42,8 @@ class AMDGPUPromoteKernelArguments : public FunctionPass {
 
   bool promotePointer(Value *Ptr);
 
+  bool promoteLoad(LoadInst *LI);
+
 public:
   static char ID;
 
@@ -73,16 +75,10 @@ void AMDGPUPromoteKernelArguments::enqueueUsers(Value *Ptr) {
       break;
     case Instruction::Load: {
       LoadInst *LD = cast<LoadInst>(U);
-      PointerType *PT = dyn_cast<PointerType>(LD->getType());
-      if (!PT ||
-          (PT->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS &&
-           PT->getAddressSpace() != AMDGPUAS::GLOBAL_ADDRESS &&
-           PT->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) ||
-          LD->getPointerOperand()->stripInBoundsOffsets() != Ptr)
-        break;
-      // TODO: This load probably can be promoted to constant address space.
-      if (!AMDGPU::isClobberedInFunction(LD, MSSA, AA))
+      if (LD->getPointerOperand()->stripInBoundsOffsets() == Ptr &&
+          !AMDGPU::isClobberedInFunction(LD, MSSA, AA))
         Ptrs.push_back(LD);
 
       break;
     }
     case Instruction::GetElementPtr:
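The isClobberedInFunction guard retained above is the caveat from the commit message. A hypothetical sketch (illustrative kernel, not from the test suite) of a load that must keep its global pointer because its memory may already have been written:

; The store may clobber what %p points to before the load executes, so the
; load is not enqueued and stays in addrspace(1).
define amdgpu_kernel void @clobbered(i32 addrspace(1)* %p) {
entry:
  store i32 0, i32 addrspace(1)* %p, align 4
  %v = load i32, i32 addrspace(1)* %p, align 4
  ret void
}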
@@ -96,15 +92,26 @@ void AMDGPUPromoteKernelArguments::enqueueUsers(Value *Ptr) {
 }
 
 bool AMDGPUPromoteKernelArguments::promotePointer(Value *Ptr) {
-  enqueueUsers(Ptr);
+  bool Changed = false;
 
+  LoadInst *LI = dyn_cast<LoadInst>(Ptr);
+  if (LI)
+    Changed |= promoteLoad(LI);
+
+  PointerType *PT = dyn_cast<PointerType>(Ptr->getType());
+  if (!PT)
+    return Changed;
+
+  if (PT->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS ||
+      PT->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
+      PT->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS)
+    enqueueUsers(Ptr);
+
-  PointerType *PT = cast<PointerType>(Ptr->getType());
   if (PT->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
-    return false;
+    return Changed;
 
-  bool IsArg = isa<Argument>(Ptr);
-  IRBuilder<> B(IsArg ? ArgCastInsertPt
-                      : &*std::next(cast<Instruction>(Ptr)->getIterator()));
+  IRBuilder<> B(LI ? &*std::next(cast<Instruction>(Ptr)->getIterator())
+                   : ArgCastInsertPt);
 
   // Cast pointer to global address space and back to flat and let
   // Infer Address Spaces pass to do all necessary rewriting.
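The comment above refers to the pre-existing flat-pointer trick. Sketched in IR with hypothetical names, promotePointer emits a round trip through the global address space and leaves the rewriting of users to the InferAddressSpaces pass:

; Round-trip cast inserted for a flat pointer %ptr; InferAddressSpaces later
; folds users of %ptr.flat onto the addrspace(1) value.
%ptr.global = addrspacecast float* %ptr to float addrspace(1)*
%ptr.flat = addrspacecast float addrspace(1)* %ptr.global to float*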
@@ -120,6 +127,38 @@ bool AMDGPUPromoteKernelArguments::promotePointer(Value *Ptr) {
   return true;
 }
 
+bool AMDGPUPromoteKernelArguments::promoteLoad(LoadInst *LI) {
+  if (!LI->isSimple())
+    return false;
+
+  Value *Ptr = LI->getPointerOperand();
+
+  // Strip casts we have created earlier.
+  Value *OrigPtr = Ptr;
+  PointerType *PT;
+  for ( ; ; ) {
+    PT = cast<PointerType>(OrigPtr->getType());
+    if (PT->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS)
+      return false;
+    auto *P = dyn_cast<AddrSpaceCastInst>(OrigPtr);
+    if (!P)
+      break;
+    auto *NewPtr = P->getPointerOperand();
+    if (!cast<PointerType>(NewPtr->getType())->hasSameElementTypeAs(PT))
+      break;
+    OrigPtr = NewPtr;
+  }
+
+  IRBuilder<> B(LI);
+
+  PointerType *NewPT =
+      PointerType::getWithSamePointeeType(PT, AMDGPUAS::CONSTANT_ADDRESS);
+  Value *Cast = B.CreateAddrSpaceCast(OrigPtr, NewPT,
+                                      Twine(OrigPtr->getName(), ".const"));
+  LI->replaceUsesOfWith(Ptr, Cast);
+  return true;
+}
+
 // skip allocas
 static BasicBlock::iterator getInsertPt(BasicBlock &BB) {
   BasicBlock::iterator InsPt = BB.getFirstInsertionPt();
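Tying promoteLoad to the casts above, a sketch with hypothetical names, assuming the load's pointer operand was produced by the earlier flat-pointer promotion: the stripping loop walks the addrspacecast chain back to the original pointer, and the load is then rewritten against a constant-address-space cast of that value:

; Input as left by the flat-pointer promotion:
;   %p.global = addrspacecast float* %p to float addrspace(1)*
;   %p.flat = addrspacecast float addrspace(1)* %p.global to float*
;   %v = load float, float* %p.flat, align 4
; promoteLoad strips %p.flat and %p.global back to %p, then emits:
%p.const = addrspacecast float* %p to float addrspace(4)*
%v = load float, float addrspace(4)* %p.const, align 4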
