-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[AArch64] Optimize memcpy for non-power of two sizes #168890
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
|
@llvm/pr-subscribers-llvm-selectiondag @llvm/pr-subscribers-backend-aarch64 Author: clf (clingfei) Changes: The previous getMemcpyLoadsAndStores implementation chained load/store instructions from "NumLdStInMemcpy - GlueIter - GluedLdStLimit" to "NumLdStInMemcpy - GlueIter". This caused problems when copying non-power-of-two sizes, because the leading loads/stores were chained with subsequent instructions at non-power-of-two-aligned offsets. This chaining pattern prevented the aarch64-ldst-opt pass from optimizing these load/store instructions. This commit changes the chaining range to run from GlueIter to GlueIter + GluedLdStLimit, so that aarch64-ldst-opt can optimize these load/store instructions properly. Closes #165947 Full diff: https://github.com/llvm/llvm-project/pull/168890.diff 2 Files Affected:
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 1b15a207a2d37..9858c163a1534 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -8746,8 +8746,8 @@ static SDValue getMemcpyLoadsAndStores(
unsigned GlueIter = 0;
for (unsigned cnt = 0; cnt < NumberLdChain; ++cnt) {
- unsigned IndexFrom = NumLdStInMemcpy - GlueIter - GluedLdStLimit;
- unsigned IndexTo = NumLdStInMemcpy - GlueIter;
+ unsigned IndexFrom = GlueIter;
+ unsigned IndexTo = GlueIter + GluedLdStLimit;
chainLoadsAndStoresForMemcpy(DAG, dl, OutChains, IndexFrom, IndexTo,
OutLoadChains, OutStoreChains);
@@ -8756,8 +8756,8 @@ static SDValue getMemcpyLoadsAndStores(
// Residual ld/st.
if (RemainingLdStInMemcpy) {
- chainLoadsAndStoresForMemcpy(DAG, dl, OutChains, 0,
- RemainingLdStInMemcpy, OutLoadChains,
+ chainLoadsAndStoresForMemcpy(DAG, dl, OutChains, GlueIter,
+ NumLdStInMemcpy, OutLoadChains,
OutStoreChains);
}
}
diff --git a/llvm/test/CodeGen/AArch64/aarch64-mops.ll b/llvm/test/CodeGen/AArch64/aarch64-mops.ll
index 1710fad9f2539..fc64ce7d26d0e 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-mops.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-mops.ll
@@ -1407,30 +1407,28 @@ define void @memcpy_inline_300(ptr %dst, ptr %src, i32 %value) {
;
; SDAG-WITHOUT-MOPS-O2-LABEL: memcpy_inline_300:
; SDAG-WITHOUT-MOPS-O2: // %bb.0: // %entry
-; SDAG-WITHOUT-MOPS-O2-NEXT: ldp q1, q0, [x1, #16]
-; SDAG-WITHOUT-MOPS-O2-NEXT: add x8, x1, #284
-; SDAG-WITHOUT-MOPS-O2-NEXT: ldr q2, [x1]
-; SDAG-WITHOUT-MOPS-O2-NEXT: stp q1, q0, [x0, #16]
-; SDAG-WITHOUT-MOPS-O2-NEXT: str q2, [x0]
-; SDAG-WITHOUT-MOPS-O2-NEXT: ldp q1, q0, [x1, #80]
-; SDAG-WITHOUT-MOPS-O2-NEXT: ldp q2, q3, [x1, #48]
-; SDAG-WITHOUT-MOPS-O2-NEXT: stp q1, q0, [x0, #80]
-; SDAG-WITHOUT-MOPS-O2-NEXT: stp q2, q3, [x0, #48]
-; SDAG-WITHOUT-MOPS-O2-NEXT: ldp q1, q0, [x1, #144]
-; SDAG-WITHOUT-MOPS-O2-NEXT: ldp q2, q3, [x1, #112]
-; SDAG-WITHOUT-MOPS-O2-NEXT: stp q1, q0, [x0, #144]
-; SDAG-WITHOUT-MOPS-O2-NEXT: stp q2, q3, [x0, #112]
-; SDAG-WITHOUT-MOPS-O2-NEXT: ldp q1, q0, [x1, #208]
-; SDAG-WITHOUT-MOPS-O2-NEXT: ldp q2, q3, [x1, #176]
-; SDAG-WITHOUT-MOPS-O2-NEXT: stp q1, q0, [x0, #208]
-; SDAG-WITHOUT-MOPS-O2-NEXT: stp q2, q3, [x0, #176]
-; SDAG-WITHOUT-MOPS-O2-NEXT: ldp q3, q1, [x1, #256]
-; SDAG-WITHOUT-MOPS-O2-NEXT: ldr q0, [x8]
-; SDAG-WITHOUT-MOPS-O2-NEXT: ldr q2, [x1, #240]
-; SDAG-WITHOUT-MOPS-O2-NEXT: add x8, x0, #284
-; SDAG-WITHOUT-MOPS-O2-NEXT: str q0, [x8]
-; SDAG-WITHOUT-MOPS-O2-NEXT: stp q3, q1, [x0, #256]
-; SDAG-WITHOUT-MOPS-O2-NEXT: str q2, [x0, #240]
+; SDAG-WITHOUT-MOPS-O2-NEXT: ldp q1, q0, [x1, #224]
+; SDAG-WITHOUT-MOPS-O2-NEXT: add x8, x1, #284
+; SDAG-WITHOUT-MOPS-O2-NEXT: ldp q2, q3, [x1, #192]
+; SDAG-WITHOUT-MOPS-O2-NEXT: stp q1, q0, [x0, #224]
+; SDAG-WITHOUT-MOPS-O2-NEXT: stp q2, q3, [x0, #192]
+; SDAG-WITHOUT-MOPS-O2-NEXT: ldp q1, q0, [x1, #160]
+; SDAG-WITHOUT-MOPS-O2-NEXT: ldp q2, q3, [x1, #128]
+; SDAG-WITHOUT-MOPS-O2-NEXT: stp q1, q0, [x0, #160]
+; SDAG-WITHOUT-MOPS-O2-NEXT: stp q2, q3, [x0, #128]
+; SDAG-WITHOUT-MOPS-O2-NEXT: ldp q1, q0, [x1, #96]
+; SDAG-WITHOUT-MOPS-O2-NEXT: ldp q2, q3, [x1, #64]
+; SDAG-WITHOUT-MOPS-O2-NEXT: stp q1, q0, [x0, #96]
+; SDAG-WITHOUT-MOPS-O2-NEXT: stp q2, q3, [x0, #64]
+; SDAG-WITHOUT-MOPS-O2-NEXT: ldp q1, q0, [x1, #32]
+; SDAG-WITHOUT-MOPS-O2-NEXT: ldp q2, q3, [x1]
+; SDAG-WITHOUT-MOPS-O2-NEXT: stp q1, q0, [x0, #32]
+; SDAG-WITHOUT-MOPS-O2-NEXT: stp q2, q3, [x0]
+; SDAG-WITHOUT-MOPS-O2-NEXT: ldp q2, q1, [x1, #256]
+; SDAG-WITHOUT-MOPS-O2-NEXT: ldr q0, [x8]
+; SDAG-WITHOUT-MOPS-O2-NEXT: add x8, x0, #284
+; SDAG-WITHOUT-MOPS-O2-NEXT: str q0, [x8]
+; SDAG-WITHOUT-MOPS-O2-NEXT: stp q2, q1, [x0, #256]
; SDAG-WITHOUT-MOPS-O2-NEXT: ret
;
; SDAG-MOPS-O2-LABEL: memcpy_inline_300:
@@ -1536,46 +1534,46 @@ define void @memcpy_inline_300_volatile(ptr %dst, ptr %src, i32 %value) {
;
; SDAG-WITHOUT-MOPS-O2-LABEL: memcpy_inline_300_volatile:
; SDAG-WITHOUT-MOPS-O2: // %bb.0: // %entry
-; SDAG-WITHOUT-MOPS-O2-NEXT: ldr q0, [x1]
-; SDAG-WITHOUT-MOPS-O2-NEXT: ldr q1, [x1, #16]
-; SDAG-WITHOUT-MOPS-O2-NEXT: ldr q2, [x1, #32]
-; SDAG-WITHOUT-MOPS-O2-NEXT: ldr q3, [x1, #48]
-; SDAG-WITHOUT-MOPS-O2-NEXT: str q3, [x0, #48]
-; SDAG-WITHOUT-MOPS-O2-NEXT: str q2, [x0, #32]
-; SDAG-WITHOUT-MOPS-O2-NEXT: str q1, [x0, #16]
-; SDAG-WITHOUT-MOPS-O2-NEXT: str q0, [x0]
-; SDAG-WITHOUT-MOPS-O2-NEXT: ldr q0, [x1, #64]
-; SDAG-WITHOUT-MOPS-O2-NEXT: ldr q1, [x1, #80]
-; SDAG-WITHOUT-MOPS-O2-NEXT: ldr q2, [x1, #96]
-; SDAG-WITHOUT-MOPS-O2-NEXT: ldr q3, [x1, #112]
-; SDAG-WITHOUT-MOPS-O2-NEXT: str q3, [x0, #112]
-; SDAG-WITHOUT-MOPS-O2-NEXT: str q2, [x0, #96]
-; SDAG-WITHOUT-MOPS-O2-NEXT: str q1, [x0, #80]
-; SDAG-WITHOUT-MOPS-O2-NEXT: str q0, [x0, #64]
-; SDAG-WITHOUT-MOPS-O2-NEXT: ldr q0, [x1, #128]
-; SDAG-WITHOUT-MOPS-O2-NEXT: ldr q1, [x1, #144]
-; SDAG-WITHOUT-MOPS-O2-NEXT: ldr q2, [x1, #160]
-; SDAG-WITHOUT-MOPS-O2-NEXT: ldr q3, [x1, #176]
-; SDAG-WITHOUT-MOPS-O2-NEXT: str q3, [x0, #176]
-; SDAG-WITHOUT-MOPS-O2-NEXT: str q2, [x0, #160]
-; SDAG-WITHOUT-MOPS-O2-NEXT: str q1, [x0, #144]
-; SDAG-WITHOUT-MOPS-O2-NEXT: str q0, [x0, #128]
-; SDAG-WITHOUT-MOPS-O2-NEXT: ldr q0, [x1, #192]
-; SDAG-WITHOUT-MOPS-O2-NEXT: ldr q1, [x1, #208]
-; SDAG-WITHOUT-MOPS-O2-NEXT: ldr q2, [x1, #224]
-; SDAG-WITHOUT-MOPS-O2-NEXT: ldr q3, [x1, #240]
-; SDAG-WITHOUT-MOPS-O2-NEXT: str q3, [x0, #240]
-; SDAG-WITHOUT-MOPS-O2-NEXT: str q2, [x0, #224]
-; SDAG-WITHOUT-MOPS-O2-NEXT: str q1, [x0, #208]
-; SDAG-WITHOUT-MOPS-O2-NEXT: str q0, [x0, #192]
-; SDAG-WITHOUT-MOPS-O2-NEXT: ldr q0, [x1, #256]
-; SDAG-WITHOUT-MOPS-O2-NEXT: ldr q1, [x1, #272]
-; SDAG-WITHOUT-MOPS-O2-NEXT: ldr x8, [x1, #288]
-; SDAG-WITHOUT-MOPS-O2-NEXT: ldr w9, [x1, #296]
-; SDAG-WITHOUT-MOPS-O2-NEXT: str w9, [x0, #296]
-; SDAG-WITHOUT-MOPS-O2-NEXT: str x8, [x0, #288]
-; SDAG-WITHOUT-MOPS-O2-NEXT: str q1, [x0, #272]
-; SDAG-WITHOUT-MOPS-O2-NEXT: str q0, [x0, #256]
+; SDAG-WITHOUT-MOPS-O2-NEXT: ldr q0, [x1, #256]
+; SDAG-WITHOUT-MOPS-O2-NEXT: ldr q1, [x1, #272]
+; SDAG-WITHOUT-MOPS-O2-NEXT: ldr x8, [x1, #288]
+; SDAG-WITHOUT-MOPS-O2-NEXT: ldr w9, [x1, #296]
+; SDAG-WITHOUT-MOPS-O2-NEXT: str w9, [x0, #296]
+; SDAG-WITHOUT-MOPS-O2-NEXT: str x8, [x0, #288]
+; SDAG-WITHOUT-MOPS-O2-NEXT: str q1, [x0, #272]
+; SDAG-WITHOUT-MOPS-O2-NEXT: str q0, [x0, #256]
+; SDAG-WITHOUT-MOPS-O2-NEXT: ldr q0, [x1, #192]
+; SDAG-WITHOUT-MOPS-O2-NEXT: ldr q1, [x1, #208]
+; SDAG-WITHOUT-MOPS-O2-NEXT: ldr q2, [x1, #224]
+; SDAG-WITHOUT-MOPS-O2-NEXT: ldr q3, [x1, #240]
+; SDAG-WITHOUT-MOPS-O2-NEXT: str q3, [x0, #240]
+; SDAG-WITHOUT-MOPS-O2-NEXT: str q2, [x0, #224]
+; SDAG-WITHOUT-MOPS-O2-NEXT: str q1, [x0, #208]
+; SDAG-WITHOUT-MOPS-O2-NEXT: str q0, [x0, #192]
+; SDAG-WITHOUT-MOPS-O2-NEXT: ldr q0, [x1, #128]
+; SDAG-WITHOUT-MOPS-O2-NEXT: ldr q1, [x1, #144]
+; SDAG-WITHOUT-MOPS-O2-NEXT: ldr q2, [x1, #160]
+; SDAG-WITHOUT-MOPS-O2-NEXT: ldr q3, [x1, #176]
+; SDAG-WITHOUT-MOPS-O2-NEXT: str q3, [x0, #176]
+; SDAG-WITHOUT-MOPS-O2-NEXT: str q2, [x0, #160]
+; SDAG-WITHOUT-MOPS-O2-NEXT: str q1, [x0, #144]
+; SDAG-WITHOUT-MOPS-O2-NEXT: str q0, [x0, #128]
+; SDAG-WITHOUT-MOPS-O2-NEXT: ldr q0, [x1, #64]
+; SDAG-WITHOUT-MOPS-O2-NEXT: ldr q1, [x1, #80]
+; SDAG-WITHOUT-MOPS-O2-NEXT: ldr q2, [x1, #96]
+; SDAG-WITHOUT-MOPS-O2-NEXT: ldr q3, [x1, #112]
+; SDAG-WITHOUT-MOPS-O2-NEXT: str q3, [x0, #112]
+; SDAG-WITHOUT-MOPS-O2-NEXT: str q2, [x0, #96]
+; SDAG-WITHOUT-MOPS-O2-NEXT: str q1, [x0, #80]
+; SDAG-WITHOUT-MOPS-O2-NEXT: str q0, [x0, #64]
+; SDAG-WITHOUT-MOPS-O2-NEXT: ldr q0, [x1]
+; SDAG-WITHOUT-MOPS-O2-NEXT: ldr q1, [x1, #16]
+; SDAG-WITHOUT-MOPS-O2-NEXT: ldr q2, [x1, #32]
+; SDAG-WITHOUT-MOPS-O2-NEXT: ldr q3, [x1, #48]
+; SDAG-WITHOUT-MOPS-O2-NEXT: str q3, [x0, #48]
+; SDAG-WITHOUT-MOPS-O2-NEXT: str q2, [x0, #32]
+; SDAG-WITHOUT-MOPS-O2-NEXT: str q1, [x0, #16]
+; SDAG-WITHOUT-MOPS-O2-NEXT: str q0, [x0]
; SDAG-WITHOUT-MOPS-O2-NEXT: ret
;
; SDAG-MOPS-O2-LABEL: memcpy_inline_300_volatile:
|
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
The previous getMemcpyLoadsAndStores implementation chained load/store instructions from "NumLdStInMemcpy - GlueIter - GluedLdStLimit" to "NumLdStInMemcpy - GlueIter". This caused problems when copying non-power-of-two sizes, because the leading loads/stores were chained with subsequent instructions at non-power-of-two-aligned offsets. This chaining pattern prevented the aarch64-ldst-opt pass from optimizing these load/store instructions. This commit changes the chaining range to run from GlueIter to GlueIter + GluedLdStLimit, so that aarch64-ldst-opt can optimize these load/store instructions properly.
🐧 Linux x64 Test Results
|
|
You should add some new tests based on the examples in the issue. This change also does not seem AArch64-specific. |
The previous getMemcpyLoadsAndStores implementation chained load/store instructions from "NumLdStInMemcpy - GlueIter - GluedLdStLimit" to "NumLdStInMemcpy - GlueIter". This caused problems when copying non-power-of-two sizes, because the leading loads/stores were chained with subsequent instructions at non-power-of-two-aligned offsets.
This chaining pattern prevented the aarch64-ldst-opt pass from optimizing these load/store instructions.
This commit changes the chaining range to run from GlueIter to GlueIter + GluedLdStLimit, so that aarch64-ldst-opt can optimize these load/store instructions properly.
Closes #165947