Skip to content

Commit

Permalink
[AMDGPU] Fix promote alloca with double use in the same insn
Browse files Browse the repository at this point in the history
If an instruction has more than one pointer operand derived from
the same promoted alloca, we fix it up for only one operand and
never fix the second use, because the user is already considered
done.

Fix this by deferring processing of memory intrinsics until all
potential operands are replaced.

Fixes: SWDEV-271358

Differential Revision: https://reviews.llvm.org/D96386
  • Loading branch information
rampitec committed Feb 11, 2021
1 parent 8151c1b commit cb41ee9
Show file tree
Hide file tree
Showing 2 changed files with 61 additions and 15 deletions.
44 changes: 29 additions & 15 deletions llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -943,6 +943,8 @@ bool AMDGPUPromoteAllocaImpl::handleAlloca(AllocaInst &I, bool SufficientLDS) {
I.replaceAllUsesWith(Offset);
I.eraseFromParent();

SmallVector<IntrinsicInst *> DeferredIntrs;

for (Value *V : WorkList) {
CallInst *Call = dyn_cast<CallInst>(V);
if (!Call) {
Expand Down Expand Up @@ -997,22 +999,13 @@ bool AMDGPUPromoteAllocaImpl::handleAlloca(AllocaInst &I, bool SufficientLDS) {
// These intrinsics are for address space 0 only
Intr->eraseFromParent();
continue;
case Intrinsic::memcpy: {
MemCpyInst *MemCpy = cast<MemCpyInst>(Intr);
Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getDestAlign(),
MemCpy->getRawSource(), MemCpy->getSourceAlign(),
MemCpy->getLength(), MemCpy->isVolatile());
Intr->eraseFromParent();
case Intrinsic::memcpy:
case Intrinsic::memmove:
// These have 2 pointer operands. In case if second pointer also needs
// to be replaced we defer processing of these intrinsics until all
// other values are processed.
DeferredIntrs.push_back(Intr);
continue;
}
case Intrinsic::memmove: {
MemMoveInst *MemMove = cast<MemMoveInst>(Intr);
Builder.CreateMemMove(MemMove->getRawDest(), MemMove->getDestAlign(),
MemMove->getRawSource(), MemMove->getSourceAlign(),
MemMove->getLength(), MemMove->isVolatile());
Intr->eraseFromParent();
continue;
}
case Intrinsic::memset: {
MemSetInst *MemSet = cast<MemSetInst>(Intr);
Builder.CreateMemSet(
Expand Down Expand Up @@ -1050,6 +1043,27 @@ bool AMDGPUPromoteAllocaImpl::handleAlloca(AllocaInst &I, bool SufficientLDS) {
llvm_unreachable("Don't know how to promote alloca intrinsic use.");
}
}

for (IntrinsicInst *Intr : DeferredIntrs) {
Builder.SetInsertPoint(Intr);
Intrinsic::ID ID = Intr->getIntrinsicID();
assert(ID == Intrinsic::memcpy || ID == Intrinsic::memmove);

MemTransferInst *MI = cast<MemTransferInst>(Intr);
auto *B =
Builder.CreateMemTransferInst(ID, MI->getRawDest(), MI->getDestAlign(),
MI->getRawSource(), MI->getSourceAlign(),
MI->getLength(), MI->isVolatile());

for (unsigned I = 1; I != 3; ++I) {
if (uint64_t Bytes = Intr->getDereferenceableBytes(I)) {
B->addDereferenceableAttr(I, Bytes);
}
}

Intr->eraseFromParent();
}

return true;
}

Expand Down
32 changes: 32 additions & 0 deletions llvm/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,11 @@

declare void @llvm.memcpy.p0i8.p1i8.i32(i8* nocapture, i8 addrspace(1)* nocapture, i32, i1) #0
declare void @llvm.memcpy.p1i8.p0i8.i32(i8 addrspace(1)* nocapture, i8* nocapture, i32, i1) #0
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i1) #0

declare void @llvm.memmove.p0i8.p1i8.i32(i8* nocapture, i8 addrspace(1)* nocapture, i32, i1) #0
declare void @llvm.memmove.p1i8.p0i8.i32(i8 addrspace(1)* nocapture, i8* nocapture, i32, i1) #0
declare void @llvm.memmove.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i1) #0

declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i1) #0

Expand Down Expand Up @@ -61,5 +63,35 @@ define amdgpu_kernel void @promote_with_objectsize(i32 addrspace(1)* %out) #0 {
ret void
}

; CHECK-LABEL: @promote_alloca_used_twice_in_memcpy(
; CHECK: %i = bitcast double addrspace(3)* %arrayidx1 to i8 addrspace(3)*
; CHECK: %i1 = bitcast double addrspace(3)* %arrayidx2 to i8 addrspace(3)*
; CHECK: call void @llvm.memcpy.p3i8.p3i8.i64(i8 addrspace(3)* align 8 dereferenceable(16) %i, i8 addrspace(3)* align 8 dereferenceable(16) %i1, i64 16, i1 false)
; Regression test: both pointer operands of a single memcpy are derived from
; the same alloca (%r). The FileCheck lines above expect both bitcasts and the
; memcpy itself to end up in addrspace(3), with the align and dereferenceable
; attributes kept on both operands.
define amdgpu_kernel void @promote_alloca_used_twice_in_memcpy(i32 %c) {
entry:
%r = alloca double, align 8
; Destination pointer: constant-index GEP off %r, cast to i8*.
%arrayidx1 = getelementptr inbounds double, double* %r, i32 1
%i = bitcast double* %arrayidx1 to i8*
; Source pointer: variable-index GEP off the same %r, cast to i8*.
%arrayidx2 = getelementptr inbounds double, double* %r, i32 %c
%i1 = bitcast double* %arrayidx2 to i8*
; Single user with two uses of the alloca: %i is the dest, %i1 is the source.
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 dereferenceable(16) %i, i8* align 8 dereferenceable(16) %i1, i64 16, i1 false)
ret void
}

; CHECK-LABEL: @promote_alloca_used_twice_in_memmove(
; CHECK: %i = bitcast double addrspace(3)* %arrayidx1 to i8 addrspace(3)*
; CHECK: %i1 = bitcast double addrspace(3)* %arrayidx2 to i8 addrspace(3)*
; CHECK: call void @llvm.memmove.p3i8.p3i8.i64(i8 addrspace(3)* align 8 dereferenceable(16) %i, i8 addrspace(3)* align 8 dereferenceable(16) %i1, i64 16, i1 false)
; Regression test: both pointer operands of a single memmove are derived from
; the same alloca (%r). The FileCheck lines above expect both bitcasts and the
; memmove itself to end up in addrspace(3), with the align and dereferenceable
; attributes kept on both operands.
define amdgpu_kernel void @promote_alloca_used_twice_in_memmove(i32 %c) {
entry:
%r = alloca double, align 8
; Destination pointer: constant-index GEP off %r, cast to i8*.
%arrayidx1 = getelementptr inbounds double, double* %r, i32 1
%i = bitcast double* %arrayidx1 to i8*
; Source pointer: variable-index GEP off the same %r, cast to i8*.
%arrayidx2 = getelementptr inbounds double, double* %r, i32 %c
%i1 = bitcast double* %arrayidx2 to i8*
; Single user with two uses of the alloca: %i is the dest, %i1 is the source.
call void @llvm.memmove.p0i8.p0i8.i64(i8* align 8 dereferenceable(16) %i, i8* align 8 dereferenceable(16) %i1, i64 16, i1 false)
ret void
}

attributes #0 = { nounwind "amdgpu-flat-work-group-size"="64,64" "amdgpu-waves-per-eu"="1,3" }
attributes #1 = { nounwind readnone }

0 comments on commit cb41ee9

Please sign in to comment.