AMDGPU/PromoteAlloca: Fix handling of users of multiple allocas #172771
Conversation
With recent refactoring, the LDS promotion worklists for all allocas are populated upfront. In some cases this places the same User in multiple lists. Then, as each list is processed, that User might get deleted via eraseFromParent, leaving a dangling pointer in a subsequent worklist. Currently this only occurs for memcpy and memmove. Prior to the refactoring, these were handled by DeferredIntrs and were processed after the last use of the then-single worklist. This change moves processing of DeferredIntrs to after all worklists have been processed.
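For illustration, a minimal sketch of the deferral pattern described above; the types and helpers below (Inst, AllocaWorklist, rewriteNow, rewriteDeferred, promoteAllToLDS) are hypothetical stand-ins, not the pass's actual internals. The point is that memcpy/memmove users, which can sit in more than one alloca's worklist, are only rewritten and erased once every worklist has been walked.

// Illustrative sketch only: these are hypothetical stand-ins for the
// pass's real data structures and helpers.
#include <vector>

struct Inst {
  bool IsMemTransfer = false; // a memcpy/memmove that may be shared by allocas
  bool Erased = false;
};

struct AllocaWorklist {
  std::vector<Inst *> Users;
};

static void rewriteNow(Inst *I) { I->Erased = true; }      // erase immediately
static void rewriteDeferred(Inst *I) { I->Erased = true; } // erase at the end

void promoteAllToLDS(std::vector<AllocaWorklist> &Worklists) {
  std::vector<Inst *> Deferred;
  for (AllocaWorklist &WL : Worklists) {
    for (Inst *I : WL.Users) {
      if (I->IsMemTransfer) {
        // Do not erase yet: the same intrinsic may still sit in another
        // alloca's worklist, and erasing it here would leave that list with
        // a dangling pointer. (A real implementation also has to avoid
        // deferring the same instruction twice; see the review below.)
        Deferred.push_back(I);
        continue;
      }
      rewriteNow(I); // safe: only reachable from the current worklist
    }
  }
  // Every worklist has been walked, so erasing the deferred intrinsics now
  // cannot invalidate an entry that is still waiting to be visited.
  for (Inst *I : Deferred)
    rewriteDeferred(I);
}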
@llvm/pr-subscribers-backend-amdgpu Author: None (macurtis-amd). Full diff: https://github.com/llvm/llvm-project/pull/172771.diff. 2 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 83b463c630d71..361a74d57c784 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -159,7 +159,10 @@ class AMDGPUPromoteAllocaImpl {
void analyzePromoteToVector(AllocaAnalysis &AA) const;
void promoteAllocaToVector(AllocaAnalysis &AA);
void analyzePromoteToLDS(AllocaAnalysis &AA) const;
- bool tryPromoteAllocaToLDS(AllocaAnalysis &AA, bool SufficientLDS);
+ bool tryPromoteAllocaToLDS(AllocaAnalysis &AA, bool SufficientLDS,
+ SmallVector<IntrinsicInst *> &DeferredIntrs);
+ void finishDeferredAllocaToLDSPromotion(
+ SmallVector<IntrinsicInst *> &DeferredIntrs);
void scoreAlloca(AllocaAnalysis &AA) const;
@@ -414,6 +417,7 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
// clang-format on
bool Changed = false;
+ SmallVector<IntrinsicInst *> DeferredIntrs;
for (AllocaAnalysis &AA : Allocas) {
if (AA.Vector.Ty) {
const unsigned AllocaCost =
@@ -435,9 +439,11 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
}
}
- if (AA.LDS.Enable && tryPromoteAllocaToLDS(AA, SufficientLDS))
+ if (AA.LDS.Enable &&
+ tryPromoteAllocaToLDS(AA, SufficientLDS, DeferredIntrs))
Changed = true;
}
+ finishDeferredAllocaToLDSPromotion(DeferredIntrs);
// NOTE: tryPromoteAllocaToVector removes the alloca, so Allocas contains
// dangling pointers. If we want to reuse it past this point, the loop above
@@ -1550,8 +1556,9 @@ bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) {
}
// FIXME: Should try to pick the most likely to be profitable allocas first.
-bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaAnalysis &AA,
- bool SufficientLDS) {
+bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(
+ AllocaAnalysis &AA, bool SufficientLDS,
+ SmallVector<IntrinsicInst *> &DeferredIntrs) {
LLVM_DEBUG(dbgs() << "Trying to promote to LDS: " << *AA.Alloca << '\n');
// Not likely to have sufficient local memory for promotion.
@@ -1620,8 +1627,6 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaAnalysis &AA,
AA.Alloca->replaceAllUsesWith(Offset);
AA.Alloca->eraseFromParent();
- SmallVector<IntrinsicInst *> DeferredIntrs;
-
PointerType *NewPtrTy = PointerType::get(Context, AMDGPUAS::LOCAL_ADDRESS);
for (Value *V : AA.LDS.Worklist) {
@@ -1730,7 +1735,13 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaAnalysis &AA,
}
}
+ return true;
+}
+
+void AMDGPUPromoteAllocaImpl::finishDeferredAllocaToLDSPromotion(
+ SmallVector<IntrinsicInst *> &DeferredIntrs) {
for (IntrinsicInst *Intr : DeferredIntrs) {
+ IRBuilder<> Builder(Intr);
Builder.SetInsertPoint(Intr);
Intrinsic::ID ID = Intr->getIntrinsicID();
assert(ID == Intrinsic::memcpy || ID == Intrinsic::memmove);
@@ -1748,6 +1759,4 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaAnalysis &AA,
Intr->eraseFromParent();
}
-
- return true;
}
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-user-mult.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-user-mult.ll
new file mode 100644
index 0000000000000..915e0910a5047
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-user-mult.ll
@@ -0,0 +1,70 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -passes=amdgpu-promote-alloca < %s | FileCheck %s
+
+; This tests the case where both pointer operands of a memcpy are promoted to LDS.
+; See `@llvm.memcpy.p5.p5.i64(... %alloca1, ... %alloca, ...)` below.
+
+
+%struct.barney = type { i8, double }
+
+; Function Attrs: nofree norecurse noreturn nounwind memory(readwrite, target_mem0: none, target_mem1: none)
+define amdgpu_kernel void @zot() local_unnamed_addr #0 {
+; CHECK-LABEL: @zot(
+; CHECK-NEXT: bb:
+; CHECK-NEXT: [[TMP0:%.*]] = call noalias nonnull dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP0]], i64 1
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4, !invariant.load [[META0:![0-9]+]]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP0]], i64 2
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(4) [[TMP3]], align 4, !range [[RNG1:![0-9]+]], !invariant.load [[META0]]
+; CHECK-NEXT: [[TMP5:%.*]] = lshr i32 [[TMP2]], 16
+; CHECK-NEXT: [[TMP6:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[TMP7:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT: [[TMP8:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT: [[TMP9:%.*]] = mul nuw nsw i32 [[TMP5]], [[TMP4]]
+; CHECK-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], [[TMP6]]
+; CHECK-NEXT: [[TMP11:%.*]] = mul nuw nsw i32 [[TMP7]], [[TMP4]]
+; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP10]], [[TMP11]]
+; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP12]], [[TMP8]]
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [1024 x [[STRUCT_BARNEY:%.*]]], ptr addrspace(3) @zot.alloca, i32 0, i32 [[TMP13]]
+; CHECK-NEXT: [[TMP15:%.*]] = call noalias nonnull dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP15]], i64 1
+; CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr addrspace(4) [[TMP16]], align 4, !invariant.load [[META0]]
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP15]], i64 2
+; CHECK-NEXT: [[TMP19:%.*]] = load i32, ptr addrspace(4) [[TMP18]], align 4, !range [[RNG1]], !invariant.load [[META0]]
+; CHECK-NEXT: [[TMP20:%.*]] = lshr i32 [[TMP17]], 16
+; CHECK-NEXT: [[TMP21:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[TMP22:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT: [[TMP23:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT: [[TMP24:%.*]] = mul nuw nsw i32 [[TMP20]], [[TMP19]]
+; CHECK-NEXT: [[TMP25:%.*]] = mul i32 [[TMP24]], [[TMP21]]
+; CHECK-NEXT: [[TMP26:%.*]] = mul nuw nsw i32 [[TMP22]], [[TMP19]]
+; CHECK-NEXT: [[TMP27:%.*]] = add i32 [[TMP25]], [[TMP26]]
+; CHECK-NEXT: [[TMP28:%.*]] = add i32 [[TMP27]], [[TMP23]]
+; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds [1024 x [[STRUCT_BARNEY]]], ptr addrspace(3) @zot.alloca1, i32 0, i32 [[TMP28]]
+; CHECK-NEXT: store i32 0, ptr addrspace(5) null, align 2147483648
+; CHECK-NEXT: call void @llvm.memcpy.p3.p3.i64(ptr addrspace(3) align 16 dereferenceable(16) [[TMP29]], ptr addrspace(3) align 16 dereferenceable(16) [[TMP14]], i64 16, i1 false)
+; CHECK-NEXT: call void @llvm.memcpy.p3.p0.i64(ptr addrspace(3) align 16 dereferenceable(16) [[TMP14]], ptr align 1 dereferenceable(16) poison, i64 16, i1 false)
+; CHECK-NEXT: [[LOAD:%.*]] = load volatile ptr, ptr addrspace(5) null, align 2147483648
+; CHECK-NEXT: br label [[BB2:%.*]]
+; CHECK: bb2:
+; CHECK-NEXT: call void @llvm.memcpy.p0.p3.i64(ptr align 1 dereferenceable(16) @hoge, ptr addrspace(3) align 16 dereferenceable(16) [[TMP29]], i64 16, i1 false)
+; CHECK-NEXT: br label [[BB2]]
+;
+bb:
+ %alloca = alloca %struct.barney, align 16, addrspace(5)
+ %alloca1 = alloca %struct.barney, align 16, addrspace(5)
+ store i32 0, ptr addrspace(5) null, align 2147483648
+ call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef align 16 dereferenceable(16) %alloca1, ptr addrspace(5) noundef align 16 dereferenceable(16) %alloca, i64 16, i1 false)
+ call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef align 16 dereferenceable(16) %alloca, ptr noundef nonnull align 1 dereferenceable(16) poison, i64 16, i1 false)
+ %load = load volatile ptr, ptr addrspace(5) null, align 2147483648
+ br label %bb2
+
+bb2: ; preds = %bb2, %bb
+ call void @llvm.memcpy.p0.p5.i64(ptr noundef nonnull align 1 dereferenceable(16) @hoge, ptr addrspace(5) noundef align 16 dereferenceable(16) %alloca1, i64 16, i1 false)
+ br label %bb2
+}
+
+declare ptr @hoge() local_unnamed_addr #1
+
+attributes #0 = { nofree norecurse noreturn nounwind memory(readwrite, target_mem0: none, target_mem1: none) "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+attributes #1 = { "uniform-work-group-size"="false" }
FWIW, I also have an alternative fix that is a bit more robust/general IMO: macurtis-amd@21a3e3c. It also allows processing of memcpy/memmove in the main switch along with the other intrinsics.
Thanks for this fix. In your fix, when a …
PrasoonMishra left a comment
See my comment for details; needs deduplication of DeferredIntrs.
Good catch. I've changed …
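One way to get that deduplication while keeping a deterministic processing order is an llvm::SetVector-style container; a minimal sketch, assuming SetVector is acceptable here (the truncated comment above does not say which container was actually adopted, and deferMemTransfer is an illustrative helper name):

#include "llvm/ADT/SetVector.h"
#include "llvm/IR/IntrinsicInst.h"

using namespace llvm;

// SetVector keeps first-insertion order (deterministic rewriting) and ignores
// repeat insertions, so a memcpy/memmove reachable from two allocas'
// worklists is deferred, rewritten, and erased exactly once.
void deferMemTransfer(SetVector<IntrinsicInst *> &DeferredIntrs,
                      IntrinsicInst *Intr) {
  DeferredIntrs.insert(Intr); // returns false if Intr was already deferred
}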
PrasoonMishra left a comment
LGTM.
ronlieb left a comment
thx for the fixes, looking forward to it landing soon
arsenm left a comment
Should this be using ValueHandle / WeakVH?
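For context, a minimal sketch of what a ValueHandle-based worklist could look like, assuming WeakVH entries replaced the raw pointers (an illustration of the suggestion, not what this patch implements; processWorklist is a hypothetical helper): a WeakVH is automatically nulled when its Value is erased, so a later worklist can simply skip dead entries.

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/ValueHandle.h"

using namespace llvm;

// Hypothetical alternative to raw Instruction* worklists: a WeakVH is set to
// null when its Value is deleted, so entries erased while processing another
// alloca's worklist are skipped rather than dereferenced as dangling pointers.
void processWorklist(SmallVectorImpl<WeakVH> &Worklist) {
  for (WeakVH &VH : Worklist) {
    Value *V = VH; // null if the underlying instruction was already erased
    if (!V)
      continue;
    auto *I = cast<Instruction>(V);
    // ... rewrite the still-live user I here ...
    (void)I;
  }
}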
%struct.barney = type { i8, double }

; Function Attrs: nofree norecurse noreturn nounwind memory(readwrite, target_mem0: none, target_mem1: none)
define amdgpu_kernel void @zot() local_unnamed_addr #0 {

Suggested change:
-define amdgpu_kernel void @zot() local_unnamed_addr #0 {
+define amdgpu_kernel void @zot() #0 {

; Function Attrs: nofree norecurse noreturn nounwind memory(readwrite, target_mem0: none, target_mem1: none)

Suggested change:
-; Function Attrs: nofree norecurse noreturn nounwind memory(readwrite, target_mem0: none, target_mem1: none)

declare ptr @hoge() local_unnamed_addr #1

attributes #0 = { nofree norecurse noreturn nounwind memory(readwrite, target_mem0: none, target_mem1: none) "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
attributes #1 = { "uniform-work-group-size"="false" }

Remove unnecessary attributes (which is probably all of them)
Remove unnecessary attributes in test case as requested in post-merge feedback (#172771).