RE: [AMDGPU] InstCombine: fold invalid calls to amdgcn intrinsics into poison values…#196151
RE: [AMDGPU] InstCombine: fold invalid calls to amdgcn intrinsics into poison values…#196151yoonseoch wants to merge 1 commit into
Conversation
|
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-backend-amdgpu Author: Yoonseo Choi (yoonseoch) ChangesReplace a call to amdgpu intrinsic into a poison value when the call is invalid because of "amdgpu-no-" attribute in the caller function. Upon #186925 (review) Reintroducing the logic with a fix of special casing kernels, which was reverted by Assisted by claude-4.6-sonnet-medium through CURSOR. Full diff: https://github.com/llvm/llvm-project/pull/196151.diff 8 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 463d63a88f690..d6d4b80c217af 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -749,6 +749,86 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
return std::nullopt;
}
+ case Intrinsic::amdgcn_dispatch_ptr: {
+ const Function *Fn = II.getFunction();
+ if (AMDGPU::isEntryFunctionCC(Fn->getCallingConv()))
+ return std::nullopt;
+ if (Fn->hasFnAttribute("amdgpu-no-dispatch-ptr"))
+ return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
+ return std::nullopt;
+ }
+ case Intrinsic::amdgcn_queue_ptr: {
+ const Function *Fn = II.getFunction();
+ if (AMDGPU::isEntryFunctionCC(Fn->getCallingConv()))
+ return std::nullopt;
+ if (Fn->hasFnAttribute("amdgpu-no-queue-ptr"))
+ return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
+ return std::nullopt;
+ }
+ case Intrinsic::amdgcn_dispatch_id: {
+ const Function *Fn = II.getFunction();
+ if (AMDGPU::isEntryFunctionCC(Fn->getCallingConv()))
+ return std::nullopt;
+ if (Fn->hasFnAttribute("amdgpu-no-dispatch-id"))
+ return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
+ return std::nullopt;
+ }
+ case Intrinsic::amdgcn_workgroup_id_x:
+ case Intrinsic::amdgcn_workgroup_id_y:
+ case Intrinsic::amdgcn_workgroup_id_z: {
+ // These attributes are only for non-kernel functions and don't-cares for
+ // kernels. Thus, a kernel having one attr and calling the corresponding
+ // intrinsic is allowed.
+ const Function *Fn = II.getFunction();
+ if (AMDGPU::isEntryFunctionCC(Fn->getCallingConv()))
+ return std::nullopt;
+
+ StringRef Attr =
+ IID == Intrinsic::amdgcn_workgroup_id_x ? "amdgpu-no-workgroup-id-x"
+ : IID == Intrinsic::amdgcn_workgroup_id_y ? "amdgpu-no-workgroup-id-y"
+ : "amdgpu-no-workgroup-id-z";
+ if (Fn->hasFnAttribute(Attr))
+ return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
+ return std::nullopt;
+ }
+ case Intrinsic::amdgcn_workitem_id_x:
+ case Intrinsic::amdgcn_workitem_id_y:
+ case Intrinsic::amdgcn_workitem_id_z: {
+ const Function *Fn = II.getFunction();
+ if (AMDGPU::isEntryFunctionCC(Fn->getCallingConv()))
+ return std::nullopt;
+
+ StringRef Attr =
+ IID == Intrinsic::amdgcn_workitem_id_x ? "amdgpu-no-workitem-id-x"
+ : IID == Intrinsic::amdgcn_workitem_id_y ? "amdgpu-no-workitem-id-y"
+ : "amdgpu-no-workitem-id-z";
+ if (Fn->hasFnAttribute(Attr))
+ return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
+ return std::nullopt;
+ }
+ case Intrinsic::amdgcn_lds_kernel_id: {
+ const Function *Fn = II.getFunction();
+ if (AMDGPU::isEntryFunctionCC(Fn->getCallingConv()))
+ return std::nullopt;
+ if (Fn->hasFnAttribute("amdgpu-no-lds-kernel-id"))
+ return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
+ return std::nullopt;
+ }
+ case Intrinsic::amdgcn_cluster_id_x:
+ case Intrinsic::amdgcn_cluster_id_y:
+ case Intrinsic::amdgcn_cluster_id_z: {
+ const Function *Fn = II.getFunction();
+ if (AMDGPU::isEntryFunctionCC(Fn->getCallingConv()))
+ return std::nullopt;
+
+ StringRef Attr =
+ IID == Intrinsic::amdgcn_cluster_id_x ? "amdgpu-no-cluster-id-x"
+ : IID == Intrinsic::amdgcn_cluster_id_y ? "amdgpu-no-cluster-id-y"
+ : "amdgpu-no-cluster-id-z";
+ if (Fn->hasFnAttribute(Attr))
+ return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
+ return std::nullopt;
+ }
case Intrinsic::amdgcn_rcp: {
Value *Src = II.getArgOperand(0);
if (isa<PoisonValue>(Src))
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.cluster.id.ll b/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.cluster.id.ll
new file mode 100644
index 0000000000000..3070498d12ca7
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.cluster.id.ll
@@ -0,0 +1,59 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -passes='instcombine' -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck %s --check-prefix=GFX1250
+
+; When amdgpu-no-cluster-id-x is set, calling the intrinsic is UB and the
+; call should be folded to poison on any target.
+define i32 @no_cluster_id_x() #0 {
+; GFX1250-LABEL: define i32 @no_cluster_id_x(
+; GFX1250-SAME: ) #[[ATTR0:[0-9]+]] {
+; GFX1250-NEXT: [[ENTRY:.*:]]
+; GFX1250-NEXT: ret i32 poison
+;
+entry:
+ %tmp = call i32 @llvm.amdgcn.cluster.id.x()
+ ret i32 %tmp
+}
+
+define i32 @no_cluster_id_y() #1 {
+; GFX1250-LABEL: define i32 @no_cluster_id_y(
+; GFX1250-SAME: ) #[[ATTR1:[0-9]+]] {
+; GFX1250-NEXT: [[ENTRY:.*:]]
+; GFX1250-NEXT: ret i32 poison
+;
+entry:
+ %tmp = call i32 @llvm.amdgcn.cluster.id.y()
+ ret i32 %tmp
+}
+
+define i32 @no_cluster_id_z() #2 {
+; GFX1250-LABEL: define i32 @no_cluster_id_z(
+; GFX1250-SAME: ) #[[ATTR2:[0-9]+]] {
+; GFX1250-NEXT: [[ENTRY:.*:]]
+; GFX1250-NEXT: ret i32 poison
+;
+entry:
+ %tmp = call i32 @llvm.amdgcn.cluster.id.z()
+ ret i32 %tmp
+}
+
+; Without the attribute, the call folds to poison on targets without the
+; cluster feature (gfx1100), but is kept on cluster-capable targets (gfx1250).
+define i32 @cluster_id_x_no_attr() {
+; GFX1250-LABEL: define i32 @cluster_id_x_no_attr(
+; GFX1250-SAME: ) #[[ATTR3:[0-9]+]] {
+; GFX1250-NEXT: [[ENTRY:.*:]]
+; GFX1250-NEXT: [[TMP:%.*]] = call i32 @llvm.amdgcn.cluster.id.x()
+; GFX1250-NEXT: ret i32 [[TMP]]
+;
+entry:
+ %tmp = call i32 @llvm.amdgcn.cluster.id.x()
+ ret i32 %tmp
+}
+
+declare i32 @llvm.amdgcn.cluster.id.x()
+declare i32 @llvm.amdgcn.cluster.id.y()
+declare i32 @llvm.amdgcn.cluster.id.z()
+
+attributes #0 = { "amdgpu-no-cluster-id-x" }
+attributes #1 = { "amdgpu-no-cluster-id-y" }
+attributes #2 = { "amdgpu-no-cluster-id-z" }
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.dispatch.id.ll b/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.dispatch.id.ll
new file mode 100644
index 0000000000000..d55beb67ca822
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.dispatch.id.ll
@@ -0,0 +1,32 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -passes='instcombine' -mtriple=amdgcn-amd-amdhsa < %s | FileCheck %s
+
+; When amdgpu-no-dispatch-id is set, calling the intrinsic is UB and the
+; call should be folded to poison.
+define i64 @no_dispatch_id() #0 {
+; CHECK-LABEL: define i64 @no_dispatch_id(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret i64 poison
+;
+entry:
+ %tmp = call i64 @llvm.amdgcn.dispatch.id()
+ ret i64 %tmp
+}
+
+; Without the attribute the call is kept.
+define i64 @dispatch_id_no_attr() {
+; CHECK-LABEL: define i64 @dispatch_id_no_attr() {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP:%.*]] = call i64 @llvm.amdgcn.dispatch.id()
+; CHECK-NEXT: ret i64 [[TMP]]
+;
+entry:
+ %tmp = call i64 @llvm.amdgcn.dispatch.id()
+ ret i64 %tmp
+}
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare i64 @llvm.amdgcn.dispatch.id()
+
+attributes #0 = { "amdgpu-no-dispatch-id" }
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.dispatch.ptr.ll b/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.dispatch.ptr.ll
new file mode 100644
index 0000000000000..74cd63be6c706
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.dispatch.ptr.ll
@@ -0,0 +1,48 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -passes='instcombine' -mtriple=amdgcn-amd-amdhsa < %s | FileCheck %s
+
+; When amdgpu-no-dispatch-ptr is set, calling the intrinsic is UB and the
+; call should be folded to poison.
+define ptr addrspace(4) @no_dispatch_ptr() #0 {
+; CHECK-LABEL: define ptr addrspace(4) @no_dispatch_ptr(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret ptr addrspace(4) poison
+;
+entry:
+ %tmp = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+ ret ptr addrspace(4) %tmp
+}
+
+; When amdgpu-no-dispatch-ptr is set, a load from the intrinsic's returned
+; pointer is also UB. The call folds to poison and loading from poison yields
+; poison via existing InstCombine rules.
+define i32 @no_dispatch_ptr_load() #0 {
+; CHECK-LABEL: define i32 @no_dispatch_ptr_load(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: store i1 true, ptr poison, align 1
+; CHECK-NEXT: ret i32 poison
+;
+entry:
+ %ptr = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+ %val = load i32, ptr addrspace(4) %ptr
+ ret i32 %val
+}
+
+; Without the attribute the call is kept.
+define ptr addrspace(4) @dispatch_ptr_no_attr() {
+; CHECK-LABEL: define ptr addrspace(4) @dispatch_ptr_no_attr() {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP:%.*]] = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+; CHECK-NEXT: ret ptr addrspace(4) [[TMP]]
+;
+entry:
+ %tmp = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+ ret ptr addrspace(4) %tmp
+}
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+
+attributes #0 = { "amdgpu-no-dispatch-ptr" }
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.lds.kernel.id.ll b/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.lds.kernel.id.ll
new file mode 100644
index 0000000000000..c699b21c802f4
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.lds.kernel.id.ll
@@ -0,0 +1,32 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -passes='instcombine' -mtriple=amdgcn-amd-amdhsa < %s | FileCheck %s
+
+; When amdgpu-no-lds-kernel-id is set, calling the intrinsic is UB and the
+; call should be folded to poison.
+define i32 @no_lds_kernel_id() #0 {
+; CHECK-LABEL: define i32 @no_lds_kernel_id(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret i32 poison
+;
+entry:
+ %tmp = call i32 @llvm.amdgcn.lds.kernel.id()
+ ret i32 %tmp
+}
+
+; Without the attribute the call is kept.
+define i32 @lds_kernel_id_no_attr() {
+; CHECK-LABEL: define i32 @lds_kernel_id_no_attr() {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
+; CHECK-NEXT: ret i32 [[TMP]]
+;
+entry:
+ %tmp = call i32 @llvm.amdgcn.lds.kernel.id()
+ ret i32 %tmp
+}
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare i32 @llvm.amdgcn.lds.kernel.id()
+
+attributes #0 = { "amdgpu-no-lds-kernel-id" }
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.queue.ptr.ll b/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.queue.ptr.ll
new file mode 100644
index 0000000000000..013e70e0b38d8
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.queue.ptr.ll
@@ -0,0 +1,48 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -passes='instcombine' -mtriple=amdgcn-amd-amdhsa < %s | FileCheck %s
+
+; When amdgpu-no-queue-ptr is set, calling the intrinsic is UB and the
+; call should be folded to poison.
+define ptr addrspace(4) @no_queue_ptr() #0 {
+; CHECK-LABEL: define ptr addrspace(4) @no_queue_ptr(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret ptr addrspace(4) poison
+;
+entry:
+ %tmp = call ptr addrspace(4) @llvm.amdgcn.queue.ptr()
+ ret ptr addrspace(4) %tmp
+}
+
+; When amdgpu-no-queue-ptr is set, a load from the intrinsic's returned
+; pointer is also UB. The call folds to poison and loading from poison yields
+; poison via existing InstCombine rules.
+define i32 @no_queue_ptr_load() #0 {
+; CHECK-LABEL: define i32 @no_queue_ptr_load(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: store i1 true, ptr poison, align 1
+; CHECK-NEXT: ret i32 poison
+;
+entry:
+ %ptr = call ptr addrspace(4) @llvm.amdgcn.queue.ptr()
+ %val = load i32, ptr addrspace(4) %ptr
+ ret i32 %val
+}
+
+; Without the attribute the call is kept.
+define ptr addrspace(4) @queue_ptr_no_attr() {
+; CHECK-LABEL: define ptr addrspace(4) @queue_ptr_no_attr() {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP:%.*]] = call ptr addrspace(4) @llvm.amdgcn.queue.ptr()
+; CHECK-NEXT: ret ptr addrspace(4) [[TMP]]
+;
+entry:
+ %tmp = call ptr addrspace(4) @llvm.amdgcn.queue.ptr()
+ ret ptr addrspace(4) %tmp
+}
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare ptr addrspace(4) @llvm.amdgcn.queue.ptr()
+
+attributes #0 = { "amdgpu-no-queue-ptr" }
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.workgroup.id.ll b/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.workgroup.id.ll
new file mode 100644
index 0000000000000..34efd980c3e30
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.workgroup.id.ll
@@ -0,0 +1,57 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -passes='instcombine' -mtriple=amdgcn-amd-amdhsa < %s | FileCheck %s
+
+; When amdgpu-no-workgroup-id-x is set, calling the intrinsic is UB and the
+; call should be folded to poison.
+define i32 @no_workgroup_id_x() #0 {
+; CHECK-LABEL: define i32 @no_workgroup_id_x(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret i32 poison
+;
+entry:
+ %tmp = call i32 @llvm.amdgcn.workgroup.id.x()
+ ret i32 %tmp
+}
+
+define i32 @no_workgroup_id_y() #1 {
+; CHECK-LABEL: define i32 @no_workgroup_id_y(
+; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret i32 poison
+;
+entry:
+ %tmp = call i32 @llvm.amdgcn.workgroup.id.y()
+ ret i32 %tmp
+}
+
+define i32 @no_workgroup_id_z() #2 {
+; CHECK-LABEL: define i32 @no_workgroup_id_z(
+; CHECK-SAME: ) #[[ATTR2:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret i32 poison
+;
+entry:
+ %tmp = call i32 @llvm.amdgcn.workgroup.id.z()
+ ret i32 %tmp
+}
+
+; Without the attribute the calls are kept.
+define i32 @workgroup_id_x_no_attr() {
+; CHECK-LABEL: define i32 @workgroup_id_x_no_attr() {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x()
+; CHECK-NEXT: ret i32 [[TMP]]
+;
+entry:
+ %tmp = call i32 @llvm.amdgcn.workgroup.id.x()
+ ret i32 %tmp
+}
+
+declare i32 @llvm.amdgcn.workgroup.id.x()
+declare i32 @llvm.amdgcn.workgroup.id.y()
+declare i32 @llvm.amdgcn.workgroup.id.z()
+
+attributes #0 = { "amdgpu-no-workgroup-id-x" }
+attributes #1 = { "amdgpu-no-workgroup-id-y" }
+attributes #2 = { "amdgpu-no-workgroup-id-z" }
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.workitem.id.ll b/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.workitem.id.ll
new file mode 100644
index 0000000000000..3972b9ae8d168
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.workitem.id.ll
@@ -0,0 +1,57 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -passes='instcombine' -mtriple=amdgcn-amd-amdhsa < %s | FileCheck %s
+
+; When amdgpu-no-workitem-id-x is set, calling the intrinsic is UB and the
+; call should be folded to poison.
+define i32 @no_workitem_id_x() #0 {
+; CHECK-LABEL: define i32 @no_workitem_id_x(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret i32 poison
+;
+entry:
+ %tmp = call i32 @llvm.amdgcn.workitem.id.x()
+ ret i32 %tmp
+}
+
+define i32 @no_workitem_id_y() #1 {
+; CHECK-LABEL: define i32 @no_workitem_id_y(
+; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret i32 poison
+;
+entry:
+ %tmp = call i32 @llvm.amdgcn.workitem.id.y()
+ ret i32 %tmp
+}
+
+define i32 @no_workitem_id_z() #2 {
+; CHECK-LABEL: define i32 @no_workitem_id_z(
+; CHECK-SAME: ) #[[ATTR2:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret i32 poison
+;
+entry:
+ %tmp = call i32 @llvm.amdgcn.workitem.id.z()
+ ret i32 %tmp
+}
+
+; Without the attribute the calls are kept.
+define i32 @workitem_id_x_no_attr() {
+; CHECK-LABEL: define i32 @workitem_id_x_no_attr() {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: ret i32 [[TMP]]
+;
+entry:
+ %tmp = call i32 @llvm.amdgcn.workitem.id.x()
+ ret i32 %tmp
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()
+declare i32 @llvm.amdgcn.workitem.id.y()
+declare i32 @llvm.amdgcn.workitem.id.z()
+
+attributes #0 = { "amdgpu-no-workitem-id-x" }
+attributes #1 = { "amdgpu-no-workitem-id-y" }
+attributes #2 = { "amdgpu-no-workitem-id-z" }
|
There was a problem hiding this comment.
Pull request overview
This PR reintroduces/extends AMDGPU-specific InstCombine folding so calls to certain llvm.amdgcn.* intrinsics are replaced with poison when the caller function is marked with the corresponding amdgpu-no-* attribute (i.e., the intrinsic use is considered invalid/UB under that contract). It also adds focused InstCombine tests for each intrinsic/attribute pair.
Changes:
- Teach
AMDGPUInstCombineIntrinsicto fold invalid calls toamdgcn_dispatch_ptr,amdgcn_queue_ptr,amdgcn_dispatch_id,amdgcn_workgroup_id_{x,y,z},amdgcn_workitem_id_{x,y,z},amdgcn_lds_kernel_id, andamdgcn_cluster_id_{x,y,z}intopoisonwhen the matchingamdgpu-no-*function attribute is present. - Add new InstCombine regression tests covering these fold-to-poison behaviors for non-entry functions.
- Add a target-feature-specific test for
cluster.id.*on a cluster-capable GPU.
Reviewed changes
Copilot reviewed 8 out of 8 changed files in this pull request and generated 5 comments.
Show a summary per file
| File | Description |
|---|---|
| llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp | Adds InstCombine folds that replace invalid AMDGPU intrinsic calls with poison based on amdgpu-no-* attributes, with special-casing for entry functions. |
| llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.workitem.id.ll | New test coverage for folding workitem.id.{x,y,z} to poison under amdgpu-no-workitem-id-*. |
| llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.workgroup.id.ll | New test coverage for folding workgroup.id.{x,y,z} to poison under amdgpu-no-workgroup-id-*. |
| llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.queue.ptr.ll | New test coverage for folding queue.ptr to poison under amdgpu-no-queue-ptr. |
| llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.lds.kernel.id.ll | New test coverage for folding lds.kernel.id to poison under amdgpu-no-lds-kernel-id. |
| llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.dispatch.ptr.ll | New test coverage for folding dispatch.ptr to poison under amdgpu-no-dispatch-ptr. |
| llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.dispatch.id.ll | New test coverage for folding dispatch.id to poison under amdgpu-no-dispatch-id. |
| llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.cluster.id.ll | New test coverage for folding cluster.id.{x,y,z} to poison under amdgpu-no-cluster-id-* (currently only on a cluster-capable target). |
| case Intrinsic::amdgcn_dispatch_ptr: { | ||
| const Function *Fn = II.getFunction(); | ||
| if (AMDGPU::isEntryFunctionCC(Fn->getCallingConv())) | ||
| return std::nullopt; | ||
| if (Fn->hasFnAttribute("amdgpu-no-dispatch-ptr")) | ||
| return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType())); | ||
| return std::nullopt; | ||
| } | ||
| case Intrinsic::amdgcn_queue_ptr: { | ||
| const Function *Fn = II.getFunction(); | ||
| if (AMDGPU::isEntryFunctionCC(Fn->getCallingConv())) | ||
| return std::nullopt; | ||
| if (Fn->hasFnAttribute("amdgpu-no-queue-ptr")) | ||
| return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType())); | ||
| return std::nullopt; | ||
| } | ||
| case Intrinsic::amdgcn_dispatch_id: { | ||
| const Function *Fn = II.getFunction(); | ||
| if (AMDGPU::isEntryFunctionCC(Fn->getCallingConv())) | ||
| return std::nullopt; | ||
| if (Fn->hasFnAttribute("amdgpu-no-dispatch-id")) | ||
| return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType())); | ||
| return std::nullopt; | ||
| } |
| case Intrinsic::amdgcn_workgroup_id_x: | ||
| case Intrinsic::amdgcn_workgroup_id_y: | ||
| case Intrinsic::amdgcn_workgroup_id_z: { | ||
| // These attributes are only for non-kernel functions and don't-cares for | ||
| // kernels. Thus, a kernel having one attr and calling the corresponding | ||
| // intrinsic is allowed. | ||
| const Function *Fn = II.getFunction(); | ||
| if (AMDGPU::isEntryFunctionCC(Fn->getCallingConv())) | ||
| return std::nullopt; | ||
|
|
||
| StringRef Attr = | ||
| IID == Intrinsic::amdgcn_workgroup_id_x ? "amdgpu-no-workgroup-id-x" | ||
| : IID == Intrinsic::amdgcn_workgroup_id_y ? "amdgpu-no-workgroup-id-y" | ||
| : "amdgpu-no-workgroup-id-z"; | ||
| if (Fn->hasFnAttribute(Attr)) | ||
| return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType())); | ||
| return std::nullopt; | ||
| } | ||
| case Intrinsic::amdgcn_workitem_id_x: | ||
| case Intrinsic::amdgcn_workitem_id_y: | ||
| case Intrinsic::amdgcn_workitem_id_z: { | ||
| const Function *Fn = II.getFunction(); | ||
| if (AMDGPU::isEntryFunctionCC(Fn->getCallingConv())) | ||
| return std::nullopt; | ||
|
|
||
| StringRef Attr = | ||
| IID == Intrinsic::amdgcn_workitem_id_x ? "amdgpu-no-workitem-id-x" | ||
| : IID == Intrinsic::amdgcn_workitem_id_y ? "amdgpu-no-workitem-id-y" | ||
| : "amdgpu-no-workitem-id-z"; | ||
| if (Fn->hasFnAttribute(Attr)) | ||
| return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType())); | ||
| return std::nullopt; | ||
| } |
| case Intrinsic::amdgcn_cluster_id_x: | ||
| case Intrinsic::amdgcn_cluster_id_y: | ||
| case Intrinsic::amdgcn_cluster_id_z: { | ||
| const Function *Fn = II.getFunction(); | ||
| if (AMDGPU::isEntryFunctionCC(Fn->getCallingConv())) | ||
| return std::nullopt; | ||
|
|
||
| StringRef Attr = | ||
| IID == Intrinsic::amdgcn_cluster_id_x ? "amdgpu-no-cluster-id-x" | ||
| : IID == Intrinsic::amdgcn_cluster_id_y ? "amdgpu-no-cluster-id-y" | ||
| : "amdgpu-no-cluster-id-z"; | ||
| if (Fn->hasFnAttribute(Attr)) | ||
| return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType())); | ||
| return std::nullopt; |
| ; Without the attribute, the call folds to poison on targets without the | ||
| ; cluster feature (gfx1100), but is kept on cluster-capable targets (gfx1250). | ||
| define i32 @cluster_id_x_no_attr() { | ||
| ; GFX1250-LABEL: define i32 @cluster_id_x_no_attr( | ||
| ; GFX1250-SAME: ) #[[ATTR3:[0-9]+]] { | ||
| ; GFX1250-NEXT: [[ENTRY:.*:]] | ||
| ; GFX1250-NEXT: [[TMP:%.*]] = call i32 @llvm.amdgcn.cluster.id.x() | ||
| ; GFX1250-NEXT: ret i32 [[TMP]] | ||
| ; |
| ; When amdgpu-no-dispatch-ptr is set, calling the intrinsic is UB and the | ||
| ; call should be folded to poison. | ||
| define ptr addrspace(4) @no_dispatch_ptr() #0 { | ||
| ; CHECK-LABEL: define ptr addrspace(4) @no_dispatch_ptr( | ||
| ; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { | ||
| ; CHECK-NEXT: [[ENTRY:.*:]] | ||
| ; CHECK-NEXT: ret ptr addrspace(4) poison | ||
| ; | ||
| entry: | ||
| %tmp = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() | ||
| ret ptr addrspace(4) %tmp | ||
| } |
|
I'm not really sure it's a good idea to special-case the semantics of those attributes. In other words, when the attribute appears on a kernel entry function, it means nothing, but when it appears on a regular device function, it suddenly has meaning. IMO, we should have consistent behavior and avoid giving the same attribute different meanings depending on where it appears. |
There was a problem hiding this comment.
The pattern is a little repetitive. I'd suggest adding
static StringRef getAMDGPUConflictFuncAttrForIntrinsic(Intrinsic::ID IID) {
switch (IID) {
case Intrinsic::amdgcn_dispatch_ptr:
return "amdgpu-no-dispatch-ptr";
case Intrinsic::amdgcn_queue_ptr:
return "amdgpu-no-queue-ptr";
case Intrinsic::amdgcn_dispatch_id:
return "amdgpu-no-dispatch-id";
case Intrinsic::amdgcn_workgroup_id_x:
return "amdgpu-no-workgroup-id-x";
case Intrinsic::amdgcn_workgroup_id_y:
return "amdgpu-no-workgroup-id-y";
case Intrinsic::amdgcn_workgroup_id_z:
return "amdgpu-no-workgroup-id-z";
case Intrinsic::amdgcn_workitem_id_x:
return "amdgpu-no-workitem-id-x";
case Intrinsic::amdgcn_workitem_id_y:
return "amdgpu-no-workitem-id-y";
case Intrinsic::amdgcn_workitem_id_z:
return "amdgpu-no-workitem-id-z";
case Intrinsic::amdgcn_lds_kernel_id:
return "amdgpu-no-lds-kernel-id";
case Intrinsic::amdgcn_cluster_id_x:
return "amdgpu-no-cluster-id-x";
case Intrinsic::amdgcn_cluster_id_y:
return "amdgpu-no-cluster-id-y";
case Intrinsic::amdgcn_cluster_id_z:
return "amdgpu-no-cluster-id-z";
default:
llvm_unreachable("unexpected intrinsic");
return StringRef();
}
}
static std::optional<Instruction *>
getPoisonedAMDGPUIntrinsicReplacement(Instruction &II, InstCombiner &IC) {
Intrinsic::ID IID = II.getIntrinsicID();
StringRef ConflictAttr = getAMDGPUConflictFuncAttrForIntrinsic(IID);
const Function *Fn = II.getFunction();
if (!AMDGPU::isEntryFunctionCC(Fn->getCallingConv()) &&
Fn->hasFnAttribute(ConflictAttr))
return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
return std::nullopt;
}and then change this to
case Intrinsic::amdgcn_dispatch_ptr:
case Intrinsic::amdgcn_queue_ptr:
case Intrinsic::amdgcn_dispatch_id:
case Intrinsic::amdgcn_workgroup_id_x:
case Intrinsic::amdgcn_workgroup_id_y:
case Intrinsic::amdgcn_workgroup_id_z:
case Intrinsic::amdgcn_workitem_id_x:
case Intrinsic::amdgcn_workitem_id_y:
case Intrinsic::amdgcn_workitem_id_z:
case Intrinsic::amdgcn_lds_kernel_id:
case Intrinsic::amdgcn_cluster_id_x:
case Intrinsic::amdgcn_cluster_id_y:
case Intrinsic::amdgcn_cluster_id_z:
return getPoisonedAMDGPUIntrinsicReplacement(II, IC);
Replace a call to amdgpu intrinsic into a poison value when the call is invalid because of "amdgpu-no-" attribute in the caller function.
Upon #186925 (review)
Reintroducing the logic with a fix of special casing kernels, which was reverted by
#192514 (Original: #191904)
Assisted by claude-4.6-sonnet-medium through CURSOR.