-
Notifications
You must be signed in to change notification settings - Fork 10.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
AMDGPU: Move libcall simplify into PeepholeEP #88853
Conversation
@llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) ChangesWe were running this immediately on the incoming IR, which Full diff: https://github.com/llvm/llvm-project/pull/88853.diff 2 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index f7e552177d6f50..305a6c8c3b9262 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -655,9 +655,6 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(
PB.registerPipelineStartEPCallback(
[](ModulePassManager &PM, OptimizationLevel Level) {
FunctionPassManager FPM;
- FPM.addPass(AMDGPUUseNativeCallsPass());
- if (EnableLibCallSimplify && Level != OptimizationLevel::O0)
- FPM.addPass(AMDGPUSimplifyLibCallsPass());
PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
if (EnableHipStdPar)
PM.addPass(HipStdParAcceleratorCodeSelectionPass());
@@ -681,6 +678,16 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(
PM.addPass(AMDGPUAlwaysInlinePass());
});
+ PB.registerPeepholeEPCallback(
+ [](FunctionPassManager &FPM, OptimizationLevel Level) {
+ if (Level == OptimizationLevel::O0)
+ return;
+
+ FPM.addPass(AMDGPUUseNativeCallsPass());
+ if (EnableLibCallSimplify)
+ FPM.addPass(AMDGPUSimplifyLibCallsPass());
+ });
+
PB.registerCGSCCOptimizerLateEPCallback(
[this](CGSCCPassManager &PM, OptimizationLevel Level) {
if (Level == OptimizationLevel::O0)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-libcall-sincos-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-libcall-sincos-pass-ordering.ll
new file mode 100644
index 00000000000000..4ae4b1031f47d9
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-libcall-sincos-pass-ordering.ll
@@ -0,0 +1,77 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -O1 -amdgpu-prelink %s | FileCheck %s
+
+; Make sure that sin+cos -> sincos simplification happens after
+; initial IR simplifications, otherwise we can't identify the common
+; argument value.
+
+@.str = private unnamed_addr addrspace(4) constant [21 x i8] c"x: %f, y: %f, z: %f\0A\00", align 1
+
+; Should have call to sincos declarations, not calls to the asm pseudo-libcalls
+define protected amdgpu_kernel void @sincosTest(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, float noundef %x) #0 {
+; CHECK-LABEL: define protected amdgpu_kernel void @sincosTest(
+; CHECK-SAME: ptr addrspace(1) nocapture writeonly [[OUT0:%.*]], ptr addrspace(1) nocapture writeonly [[OUT1:%.*]], ptr addrspace(1) nocapture writeonly [[OUT2:%.*]], float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5)
+; CHECK-NEXT: [[I_I:%.*]] = call float @_Z6sincosfPU3AS5f(float [[X]], ptr addrspace(5) [[__SINCOS_]]) #[[ATTR1:[0-9]+]]
+; CHECK-NEXT: [[I_I2:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4
+; CHECK-NEXT: [[ADD:%.*]] = fadd float [[I_I]], [[I_I2]]
+; CHECK-NEXT: [[CONV:%.*]] = fpext float [[X]] to double
+; CHECK-NEXT: [[CONV5:%.*]] = fpext float [[ADD]] to double
+; CHECK-NEXT: store double [[CONV]], ptr addrspace(1) [[OUT0]], align 8
+; CHECK-NEXT: store double [[CONV5]], ptr addrspace(1) [[OUT1]], align 8
+; CHECK-NEXT: store double [[CONV5]], ptr addrspace(1) [[OUT2]], align 8
+; CHECK-NEXT: ret void
+;
+entry:
+ %x.addr = alloca float, align 4, addrspace(5)
+ %y = alloca float, align 4, addrspace(5)
+ %z = alloca float, align 4, addrspace(5)
+ store float %x, ptr addrspace(5) %x.addr, align 4
+ call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) %y)
+ %i = load float, ptr addrspace(5) %x.addr, align 4
+ %call = call float @_Z3sinf(float noundef %i) #3
+ %i1 = load float, ptr addrspace(5) %x.addr, align 4
+ %call1 = call float @_Z3cosf(float noundef %i1) #3
+ %add = fadd float %call, %call1
+ store float %add, ptr addrspace(5) %y, align 4
+ call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) %z)
+ %i2 = load float, ptr addrspace(5) %x.addr, align 4
+ %call2 = call float @_Z3cosf(float noundef %i2) #3
+ %i3 = load float, ptr addrspace(5) %x.addr, align 4
+ %call3 = call float @_Z3sinf(float noundef %i3) #3
+ %add4 = fadd float %call2, %call3
+ store float %add4, ptr addrspace(5) %z, align 4
+ %i4 = load float, ptr addrspace(5) %x.addr, align 4
+ %conv = fpext float %i4 to double
+ %i5 = load float, ptr addrspace(5) %y, align 4
+ %conv5 = fpext float %i5 to double
+ %i6 = load float, ptr addrspace(5) %z, align 4
+ %conv6 = fpext float %i6 to double
+ store double %conv, ptr addrspace(1) %out0, align 8
+ store double %conv5, ptr addrspace(1) %out1, align 8
+ store double %conv6, ptr addrspace(1) %out2, align 8
+ call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) %z)
+ call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) %y)
+ ret void
+}
+
+declare void @llvm.lifetime.start.p5(i64 immarg, ptr addrspace(5) nocapture) #1
+declare void @llvm.lifetime.end.p5(i64 immarg, ptr addrspace(5) nocapture) #1
+
+define internal float @_Z3cosf(float noundef %arg) #2 {
+bb:
+ %i = tail call float asm "pseudo-libcall-cos %0, %1", "=v,v"(float noundef %arg) #2
+ ret float %i
+}
+
+define internal float @_Z3sinf(float noundef %arg) #2 {
+bb:
+ %i = tail call float asm "pseudo-libcall-sin %0, %1", "=v,v"(float noundef %arg) #2
+ ret float %i
+}
+
+attributes #0 = { norecurse nounwind }
+attributes #1 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
+attributes #2 = { mustprogress nofree norecurse nounwind willreturn memory(none) }
+attributes #3 = { nounwind willreturn memory(none) }
|
We were running this immediately on the incoming IR, which is still littered with temporary allocas obscuring trivial values. This needs to run after initial SROA to handle sincos insertion. Fixes: SWDEV-456865
84aaed7
to
0adb3eb
Compare
declare void @llvm.lifetime.start.p5(i64 immarg, ptr addrspace(5) nocapture) #1 | ||
declare void @llvm.lifetime.end.p5(i64 immarg, ptr addrspace(5) nocapture) #1 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do we still need intrinsic declarations?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Kind of no, but it's still canonically printed
attributes #0 = { norecurse nounwind } | ||
attributes #1 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } | ||
attributes #2 = { mustprogress nofree norecurse nounwind willreturn memory(none) } | ||
attributes #3 = { nounwind willreturn memory(none) } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do we need all these attributes?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We need enough to get the asm calls deleted
call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) %z) | ||
call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) %y) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
are the lifetime intrinsics actually needed?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
They're more representative of actual IR the pass will see, which is part of the point
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This fixes the issue I was having with my sincos() example. Thanks!
We were running this immediately on the incoming IR, which is still littered with temporary allocas obscuring trivial values. This needs to run after initial SROA to handle sincos insertion. Cherry-pick bc3620d for SWDEV-436409 Change-Id: Ifa1945ff5b7eb2d3a60f68aca923f05882b03b4a
We were running this immediately on the incoming IR, which
is still littered with temporary allocas obscuring trivial values.
This needs to run after initial SROA to handle sincos insertion.