-
Notifications
You must be signed in to change notification settings - Fork 10.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AArch64] MI Scheduler LDP combine follow up #79003
Conversation
@llvm/pr-subscribers-backend-aarch64 Author: Sjoerd Meijer (sjoerdmeijer) ChangesThis is a follow up of 75d820d, adding more opcodes to the combine target hook enabling more LDP/STP creation. Patch co-authored by Cameron McInally. Full diff: https://github.com/llvm/llvm-project/pull/79003.diff 3 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 42b7a6418032adc..fbdd5f85e7aae82 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -4214,6 +4214,27 @@ static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
switch (FirstOpc) {
default:
return false;
+ case AArch64::STRSui:
+ case AArch64::STURSi:
+ return SecondOpc == AArch64::STRSui || SecondOpc == AArch64::STURSi;
+ case AArch64::STRDui:
+ case AArch64::STURDi:
+ return SecondOpc == AArch64::STRDui || SecondOpc == AArch64::STURDi;
+ case AArch64::STRQui:
+ case AArch64::STURQi:
+ return SecondOpc == AArch64::STRQui || SecondOpc == AArch64::STURQi;
+ case AArch64::STRWui:
+ case AArch64::STURWi:
+ return SecondOpc == AArch64::STRWui || SecondOpc == AArch64::STURWi;
+ case AArch64::STRXui:
+ case AArch64::STURXi:
+ return SecondOpc == AArch64::STRXui || SecondOpc == AArch64::STURXi;
+ case AArch64::LDRSui:
+ case AArch64::LDURSi:
+ return SecondOpc == AArch64::LDRSui || SecondOpc == AArch64::LDURSi;
+ case AArch64::LDRDui:
+ case AArch64::LDURDi:
+ return SecondOpc == AArch64::LDRDui || SecondOpc == AArch64::LDURDi;
case AArch64::LDRQui:
case AArch64::LDURQi:
return SecondOpc == AArch64::LDRQui || SecondOpc == AArch64::LDURQi;
@@ -4223,6 +4244,9 @@ static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
case AArch64::LDRSWui:
case AArch64::LDURSWi:
return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi;
+ case AArch64::LDRXui:
+ case AArch64::LDURXi:
+ return SecondOpc == AArch64::LDRXui || SecondOpc == AArch64::LDURXi;
}
// These instructions can't be paired based on their opcodes.
return false;
diff --git a/llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll b/llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll
index 83f86d1c3a7cbf3..8c7b31fd34c488b 100644
--- a/llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll
@@ -1,5 +1,5 @@
; REQUIRES: asserts
-; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s --check-prefixes=CHECK,CHECK-A57
; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=exynos-m3 -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s
; Test ldr clustering.
@@ -114,6 +114,22 @@ define <2 x i64> @ldq_cluster(ptr %p) {
ret <2 x i64> %res
}
+; CHECK: ********** MI Scheduling **********
+; CHECK: LDURSi_LDRSui:%bb.0 entry
+; CHECK: Cluster ld/st SU(3) - SU(4)
+; CHECK: SU(3): %3:fpr32 = LDURSi %0:gpr64
+; CHECK: SU(4): %4:fpr32 = LDRSui %0:gpr64
+;
+define void @LDURSi_LDRSui(ptr nocapture readonly %arg, ptr nocapture readonly %wa, ptr nocapture readonly %wb) {
+entry:
+ %r51 = getelementptr i8, ptr %arg, i64 -4
+ %r52 = load float, ptr %r51, align 4
+ %r53 = load float, ptr %arg, align 4
+ store float %r52, ptr %wa
+ store float %r53, ptr %wb
+ ret void
+}
+
; Test LDURQi / LDRQui clustering
;
; CHECK: ********** MI Scheduling **********
@@ -154,3 +170,130 @@ vector_body:
exit:
ret void
}
+
+; Test LDURDi / LDRDui clustering
+;
+; CHECK: ********** MI Scheduling **********
+; CHECK: LDURDi_LDRDui:%bb.1 vector_body
+;
+; CHECK: Cluster ld/st SU(2) - SU(6)
+; CHECK: Cluster ld/st SU(3) - SU(7)
+;
+; CHECK: SU(2): %{{[0-9]+}}:fpr64 = LDURDi
+; CHECK: SU(3): %{{[0-9]+}}:fpr64 = LDURDi
+; CHECK: SU(6): %{{[0-9]+}}:fpr64 = LDRDui
+; CHECK: SU(7): %{{[0-9]+}}:fpr64 = LDRDui
+;
+define void @LDURDi_LDRDui(ptr nocapture readonly %arg) {
+entry:
+ br label %vector_body
+vector_body:
+ %phi1 = phi ptr [ null, %entry ], [ %r63, %vector_body ]
+ %phi2 = phi ptr [ %arg, %entry ], [ %r62, %vector_body ]
+ %phi3 = phi i32 [ 0, %entry ], [ %r61, %vector_body ]
+ %r51 = getelementptr i8, ptr %phi1, i64 -8
+ %r52 = load <2 x float>, ptr %r51, align 8
+ %r53 = getelementptr i8, ptr %phi2, i64 -8
+ %r54 = load <2 x float>, ptr %r53, align 8
+ %r55 = fmul fast <2 x float> %r54, <float 3.0, float 4.0>
+ %r56 = fsub fast <2 x float> %r52, %r55
+ store <2 x float> %r56, ptr %r51, align 1
+ %r57 = load <2 x float>, ptr %phi1, align 8
+ %r58 = load <2 x float>, ptr %phi2, align 8
+ %r59 = fmul fast <2 x float> %r58, <float 3.0, float 4.0>
+ %r60 = fsub fast <2 x float> %r57, %r59
+ store <2 x float> %r60, ptr %phi1, align 1
+ %r61 = add i32 %phi3, 4
+ %r62 = getelementptr i8, ptr %phi2, i64 32
+ %r63 = getelementptr i8, ptr %phi1, i64 32
+ %r.not = icmp eq i32 %r61, 0
+ br i1 %r.not, label %exit, label %vector_body
+exit:
+ ret void
+}
+
+; CHECK: ********** MI Scheduling **********
+; CHECK: LDURXi_LDRXui:%bb.0 entry
+; CHECK: Cluster ld/st SU(3) - SU(4)
+; CHECK: SU(3): %{{[0-9]+}}:gpr64 = LDURXi
+; CHECK: SU(4): %{{[0-9]+}}:gpr64 = LDRXui
+;
+define void @LDURXi_LDRXui(ptr nocapture readonly %arg, ptr nocapture readonly %wa, ptr nocapture readonly %wb) {
+entry:
+ %r51 = getelementptr i8, ptr %arg, i64 -8
+ %r52 = load i64, ptr %r51, align 8
+ %r53 = load i64, ptr %arg, align 8
+ store i64 %r52, ptr %wa
+ store i64 %r53, ptr %wb
+ ret void
+}
+
+; CHECK: ********** MI Scheduling **********
+; CHECK: STURWi_STRWui:%bb.0 entry
+; CHECK: Cluster ld/st SU(3) - SU(4)
+; CHECK: SU(3): STURWi %{{[0-9]+}}:gpr32
+; CHECK: SU(4): STRWui %{{[0-9]+}}:gpr32
+;
+define void @STURWi_STRWui(ptr nocapture readonly %arg, i32 %b, i32 %c) {
+entry:
+ %r51 = getelementptr i8, ptr %arg, i64 -4
+ store i32 %b, ptr %r51
+ store i32 %c, ptr %arg
+ ret void
+}
+
+; CHECK: ********** MI Scheduling **********
+; CHECK: STURXi_STRXui:%bb.0 entry
+; CHECK: Cluster ld/st SU(3) - SU(4)
+; CHECK: SU(3): STURXi %{{[0-9]+}}:gpr64
+; CHECK: SU(4): STRXui %{{[0-9]+}}:gpr64
+;
+define void @STURXi_STRXui(ptr nocapture readonly %arg, i64 %b, i64 %c) {
+entry:
+ %r51 = getelementptr i8, ptr %arg, i64 -8
+ store i64 %b, ptr %r51
+ store i64 %c, ptr %arg
+ ret void
+}
+
+; CHECK-A57: ********** MI Scheduling **********
+; CHECK-A57: STURSi_STRSui:%bb.0 entry
+; CHECK-A57: Cluster ld/st SU(3) - SU(4)
+; CHECK-A57: SU(3): STURSi %{{[0-9]+}}:fpr32
+; CHECK-A57: SU(4): STRSui %{{[0-9]+}}:fpr32
+;
+define void @STURSi_STRSui(ptr nocapture readonly %arg, float %b, float %c) {
+entry:
+ %r51 = getelementptr i8, ptr %arg, i64 -4
+ store float %b, ptr %r51
+ store float %c, ptr %arg
+ ret void
+}
+
+; CHECK-A57: ********** MI Scheduling **********
+; CHECK-A57: STURDi_STRDui:%bb.0 entry
+; CHECK-A57: Cluster ld/st SU(3) - SU(4)
+; CHECK-A57: SU(3): STURDi %{{[0-9]+}}:fpr64
+; CHECK-A57: SU(4): STRDui %{{[0-9]+}}:fpr64
+;
+define void @STURDi_STRDui(ptr nocapture readonly %arg, <2 x float> %b, <2 x float> %c) {
+entry:
+ %r51 = getelementptr i8, ptr %arg, i64 -8
+ store <2 x float> %b, ptr %r51
+ store <2 x float> %c, ptr %arg
+ ret void
+}
+
+; CHECK-A57: ********** MI Scheduling **********
+; CHECK-A57: STURQi_STRQui:%bb.0 entry
+; CHECK-A57: Cluster ld/st SU(3) - SU(4)
+; CHECK-A57: SU(3): STURQi %{{[0-9]+}}:fpr128
+; CHECK-A57: SU(4): STRQui %{{[0-9]+}}:fpr128
+;
+define void @STURQi_STRQui(ptr nocapture readonly %arg, <2 x double> %b, <2 x double> %c) {
+entry:
+ %r51 = getelementptr i8, ptr %arg, i64 -16
+ store <2 x double> %b, ptr %r51
+ store <2 x double> %c, ptr %arg
+ ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
index 0a3476e5f4cef63..08ad34c7b03ba07 100644
--- a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
@@ -1680,31 +1680,31 @@ define void @zext_v8i8_to_v8i64_with_add_in_sequence_in_loop(ptr %src, ptr %dst)
; CHECK-NEXT: add x9, x0, #8
; CHECK-NEXT: LBB17_1: ; %loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ldp d2, d4, [x9, #-8]
+; CHECK-NEXT: ldp d2, d3, [x9, #-8]
; CHECK-NEXT: add x10, x1, x8
; CHECK-NEXT: ldp q6, q5, [x10, #32]
; CHECK-NEXT: add x8, x8, #128
; CHECK-NEXT: ldp q17, q16, [x10]
; CHECK-NEXT: cmp x8, #1024
-; CHECK-NEXT: tbl.16b v3, { v2 }, v1
+; CHECK-NEXT: tbl.16b v4, { v2 }, v1
; CHECK-NEXT: tbl.16b v2, { v2 }, v0
-; CHECK-NEXT: tbl.16b v7, { v4 }, v1
-; CHECK-NEXT: tbl.16b v4, { v4 }, v0
+; CHECK-NEXT: tbl.16b v7, { v3 }, v1
+; CHECK-NEXT: tbl.16b v3, { v3 }, v0
; CHECK-NEXT: add x9, x9, #16
-; CHECK-NEXT: uaddw2.2d v5, v5, v3
-; CHECK-NEXT: uaddw.2d v3, v6, v3
+; CHECK-NEXT: uaddw2.2d v5, v5, v4
+; CHECK-NEXT: uaddw.2d v4, v6, v4
; CHECK-NEXT: uaddw2.2d v6, v16, v2
; CHECK-NEXT: ldp q18, q16, [x10, #96]
; CHECK-NEXT: uaddw.2d v2, v17, v2
-; CHECK-NEXT: stp q3, q5, [x10, #32]
+; CHECK-NEXT: stp q4, q5, [x10, #32]
; CHECK-NEXT: ldp q17, q5, [x10, #64]
; CHECK-NEXT: uaddw2.2d v16, v16, v7
; CHECK-NEXT: uaddw.2d v7, v18, v7
; CHECK-NEXT: stp q2, q6, [x10]
-; CHECK-NEXT: uaddw2.2d v3, v5, v4
-; CHECK-NEXT: uaddw.2d v4, v17, v4
+; CHECK-NEXT: uaddw2.2d v4, v5, v3
+; CHECK-NEXT: uaddw.2d v3, v17, v3
; CHECK-NEXT: stp q7, q16, [x10, #96]
-; CHECK-NEXT: stp q4, q3, [x10, #64]
+; CHECK-NEXT: stp q3, q4, [x10, #64]
; CHECK-NEXT: b.ne LBB17_1
; CHECK-NEXT: ; %bb.2: ; %exit
; CHECK-NEXT: ret
|
Thanks for looking into this. Do you have benchmark results? This is quite a few extra pairs, and there is a chance it can cause less scheduling freedom. |
This has been benchmarked, but I will repeat it for SPEC INT and FP just to double check. |
I have benchmarked SPEC INT and FP on the V2 and the N1, and performance is flat. I don't have access to in-order cores, but SPEC looks good on bigger ones. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do you mind splitting the loads and stores into separate patches? I think this is OK, but it would be good to have it split out in case we find a problem with one or the other.
Sure, no problem, will do. |
dc9c5c1
to
4bd9a16
Compare
Sorry I don't think I got the update for the latest update. Thanks for splitting this out. It looks like the tests might be failing in CI, but otherwise this LGTM. |
Thanks Dave, will see what is going on with the test before committing. |
This is a follow up of 75d820d, adding more opcodes to the combine target hook enabling more LDP creation. Patch co-authored by Cameron McInally.
4bd9a16
to
d2b7358
Compare
The test failure was related to an unused CHECK prefix, caught in a build with asserts, so that's nice. It was a leftover from the STP part that I factored out, which I will now put up for review soon after merging this. |
Add opcodes for different store instructions to the target hook that can enable more STP pairs. This is split off from the patch that does the same for some load instructions (llvm#79003). Patch co-authored by Cameron McInally.
Add opcodes for different store instructions to the target hook that can enable more STP pairs. This is split off from the patch that does the same for some load instructions (#79003). Patch co-authored by Cameron McInally.
This is a follow up of 75d820d, adding more opcodes to the combine target hook enabling more LDP creation.
Patch co-authored by Cameron McInally.