AArch64: Optimize memmove for non-power-of-two sizes #168633

Conversation
@llvm/pr-subscribers-backend-aarch64

Author: Osama Abdelkader (osamakader)

Changes: Fixes #165948

Patch is 21.36 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/168633.diff

4 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 8f41f230b5521..43f570a76b10b 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -18472,6 +18472,55 @@ EVT AArch64TargetLowering::getOptimalMemOpType(
return MVT::Other;
}
+bool AArch64TargetLowering::findOptimalMemOpLowering(
+ LLVMContext &Context, std::vector<EVT> &MemOps, unsigned Limit,
+ const MemOp &Op, unsigned DstAS, unsigned SrcAS,
+ const AttributeList &FuncAttributes) const {
+ // For memmove with specific non-power-of-two sizes (5-7, 9-15, 17-23, 25-31,
+ // 33-47, 49-63, 65), we want to use overlapping operations handled by
+ // EmitTargetCodeForMemmove. Return false here to prevent the generic
+ // expansion, so EmitTargetCodeForMemmove gets called.
+ // Note: We check !Op.isMemset() to identify memmove (memmove allows overlap
+ // by definition, even if the generic code sets allowOverlap=false due to
+ // volatility concerns)
+ if (!Op.isMemset()) {
+ uint64_t Size = Op.size();
+ // Only handle non-power-of-two sizes > 4 and <= 65 where we have custom
+ // overlapping logic. Sizes that are multiples of 8 (8, 16, 24, 32, etc.)
+ // are handled efficiently by the generic code.
+ bool HandledSize = Size > 4 && Size <= 65 && (Size % 8 != 0);
+
+ if (HandledSize) {
+ auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
+ if (Op.isAligned(AlignCheck))
+ return true;
+ unsigned Fast;
+ return allowsMisalignedMemoryAccesses(VT, DstAS, Align(1),
+ MachineMemOperand::MONone, &Fast) &&
+ Fast;
+ };
+
+ // Check if we can use the appropriate type for this size range
+ bool CanHandle = false;
+ if (Size >= 5 && Size <= 7) {
+ CanHandle = AlignmentIsAcceptable(MVT::i32, Align(1));
+ } else if (Size >= 9 && Size <= 23) {
+ CanHandle = AlignmentIsAcceptable(MVT::i64, Align(1));
+ } else if (Size >= 25 && Size <= 65) {
+ CanHandle = AlignmentIsAcceptable(MVT::v16i8, Align(1)) &&
+ AlignmentIsAcceptable(MVT::i64, Align(1));
+ }
+
+ if (CanHandle)
+ return false;
+ }
+ }
+
+ // Otherwise, use the default implementation
+ return TargetLowering::findOptimalMemOpLowering(Context, MemOps, Limit, Op,
+ DstAS, SrcAS, FuncAttributes);
+}
+
LLT AArch64TargetLowering::getOptimalMemOpLLT(
const MemOp &Op, const AttributeList &FuncAttributes) const {
bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index be198e54cbcbf..d6b1e63e689c2 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -258,6 +258,11 @@ class AArch64TargetLowering : public TargetLowering {
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op,
const AttributeList &FuncAttributes) const override;
+ bool findOptimalMemOpLowering(LLVMContext &Context, std::vector<EVT> &MemOps,
+ unsigned Limit, const MemOp &Op, unsigned DstAS,
+ unsigned SrcAS,
+ const AttributeList &FuncAttributes) const override;
+
LLT getOptimalMemOpLLT(const MemOp &Op,
const AttributeList &FuncAttributes) const override;
diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
index 48e03ad853d26..19505cf3952cf 100644
--- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
@@ -252,6 +252,329 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemmove(
if (LowerToSMERoutines && !Attrs.hasNonStreamingInterfaceAndBody())
return EmitStreamingCompatibleMemLibCall(DAG, dl, Chain, Dst, Src, Size,
RTLIB::MEMMOVE);
+
+ // Handle small memmove cases with overlapping loads/stores for better codegen
+ // For non-power-of-two sizes, use overlapping operations instead of
+ // mixed-size operations (e.g., for 7 bytes: two i32 loads/stores with overlap
+ // instead of i32 + i16 + i8)
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Size)) {
+ uint64_t SizeVal = C->getZExtValue();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
+ if (Alignment >= AlignCheck)
+ return true;
+ unsigned Fast;
+ return TLI.allowsMisalignedMemoryAccesses(VT, DstPtrInfo.getAddrSpace(),
+ Align(1),
+ MachineMemOperand::MONone,
+ &Fast) &&
+ Fast;
+ };
+
+ MachineMemOperand::Flags MMOFlags =
+ isVolatile ? MachineMemOperand::MOVolatile
+ : MachineMemOperand::MONone;
+
+ // For sizes 5-7 bytes: use two overlapping i32 operations
+ if (SizeVal >= 5 && SizeVal <= 7) {
+ if (AlignmentIsAcceptable(MVT::i32, Align(1))) {
+ uint64_t SecondOffset = SizeVal - 4;
+
+ SDValue Load1 = DAG.getLoad(
+ MVT::i32, dl, Chain,
+ DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(0)),
+ SrcPtrInfo.getWithOffset(0), Alignment, MMOFlags);
+
+ SDValue Load2 = DAG.getLoad(
+ MVT::i32, dl, Chain,
+ DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(SecondOffset)),
+ SrcPtrInfo.getWithOffset(SecondOffset), Alignment, MMOFlags);
+
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ Load1.getValue(1), Load2.getValue(1));
+
+ SDValue Store1 = DAG.getStore(
+ Chain, dl, Load1,
+ DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(0)),
+ DstPtrInfo.getWithOffset(0), Alignment, MMOFlags);
+
+ SDValue Store2 = DAG.getStore(
+ Chain, dl, Load2,
+ DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(SecondOffset)),
+ DstPtrInfo.getWithOffset(SecondOffset), Alignment, MMOFlags);
+
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store1, Store2);
+ }
+ }
+
+ // For sizes 9-15 bytes: use i64 + overlapping i64
+ if (SizeVal >= 9 && SizeVal <= 15) {
+ if (AlignmentIsAcceptable(MVT::i64, Align(1))) {
+ uint64_t SecondOffset = SizeVal - 8;
+
+ SDValue Load1 = DAG.getLoad(
+ MVT::i64, dl, Chain,
+ DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(0)),
+ SrcPtrInfo.getWithOffset(0), Alignment, MMOFlags);
+
+ SDValue Load2 = DAG.getLoad(
+ MVT::i64, dl, Chain,
+ DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(SecondOffset)),
+ SrcPtrInfo.getWithOffset(SecondOffset), Alignment, MMOFlags);
+
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ Load1.getValue(1), Load2.getValue(1));
+
+ SDValue Store1 = DAG.getStore(
+ Chain, dl, Load1,
+ DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(0)),
+ DstPtrInfo.getWithOffset(0), Alignment, MMOFlags);
+
+ SDValue Store2 = DAG.getStore(
+ Chain, dl, Load2,
+ DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(SecondOffset)),
+ DstPtrInfo.getWithOffset(SecondOffset), Alignment, MMOFlags);
+
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store1, Store2);
+ }
+ }
+
+ // For sizes 17-23 bytes: use i64 + i64 + overlapping i64
+ if (SizeVal >= 17 && SizeVal <= 23) {
+ if (AlignmentIsAcceptable(MVT::i64, Align(1))) {
+ uint64_t ThirdOffset = SizeVal - 8;
+
+ SDValue Load1 = DAG.getLoad(
+ MVT::i64, dl, Chain,
+ DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(0)),
+ SrcPtrInfo.getWithOffset(0), Alignment, MMOFlags);
+
+ SDValue Load2 = DAG.getLoad(
+ MVT::i64, dl, Chain,
+ DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(8)),
+ SrcPtrInfo.getWithOffset(8), Alignment, MMOFlags);
+
+ SDValue Load3 = DAG.getLoad(
+ MVT::i64, dl, Chain,
+ DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(ThirdOffset)),
+ SrcPtrInfo.getWithOffset(ThirdOffset), Alignment, MMOFlags);
+
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ Load1.getValue(1), Load2.getValue(1), Load3.getValue(1));
+
+ SDValue Store1 = DAG.getStore(
+ Chain, dl, Load1,
+ DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(0)),
+ DstPtrInfo.getWithOffset(0), Alignment, MMOFlags);
+
+ SDValue Store2 = DAG.getStore(
+ Chain, dl, Load2,
+ DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(8)),
+ DstPtrInfo.getWithOffset(8), Alignment, MMOFlags);
+
+ SDValue Store3 = DAG.getStore(
+ Chain, dl, Load3,
+ DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(ThirdOffset)),
+ DstPtrInfo.getWithOffset(ThirdOffset), Alignment, MMOFlags);
+
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store1, Store2, Store3);
+ }
+ }
+
+ // For sizes 25-31 bytes: use v16i8 (vector) + overlapping i64
+ if (SizeVal >= 25 && SizeVal <= 31) {
+ if (AlignmentIsAcceptable(MVT::v16i8, Align(1)) &&
+ AlignmentIsAcceptable(MVT::i64, Align(1))) {
+ uint64_t SecondOffset = SizeVal - 8;
+
+ SDValue Load1 = DAG.getLoad(
+ MVT::v16i8, dl, Chain,
+ DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(0)),
+ SrcPtrInfo.getWithOffset(0), Alignment, MMOFlags);
+
+ SDValue Load2 = DAG.getLoad(
+ MVT::i64, dl, Chain,
+ DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(SecondOffset)),
+ SrcPtrInfo.getWithOffset(SecondOffset), Alignment, MMOFlags);
+
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ Load1.getValue(1), Load2.getValue(1));
+
+ SDValue Store1 = DAG.getStore(
+ Chain, dl, Load1,
+ DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(0)),
+ DstPtrInfo.getWithOffset(0), Alignment, MMOFlags);
+
+ SDValue Store2 = DAG.getStore(
+ Chain, dl, Load2,
+ DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(SecondOffset)),
+ DstPtrInfo.getWithOffset(SecondOffset), Alignment, MMOFlags);
+
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store1, Store2);
+ }
+ }
+
+ // For sizes 33-47 bytes: use 2 x v16i8 (vectors) + overlapping i64
+ if (SizeVal >= 33 && SizeVal <= 47) {
+ if (AlignmentIsAcceptable(MVT::v16i8, Align(1)) &&
+ AlignmentIsAcceptable(MVT::i64, Align(1))) {
+ uint64_t ThirdOffset = SizeVal - 8;
+
+ SDValue Load1 = DAG.getLoad(
+ MVT::v16i8, dl, Chain,
+ DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(0)),
+ SrcPtrInfo.getWithOffset(0), Alignment, MMOFlags);
+
+ SDValue Load2 = DAG.getLoad(
+ MVT::v16i8, dl, Chain,
+ DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(16)),
+ SrcPtrInfo.getWithOffset(16), Alignment, MMOFlags);
+
+ SDValue Load3 = DAG.getLoad(
+ MVT::i64, dl, Chain,
+ DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(ThirdOffset)),
+ SrcPtrInfo.getWithOffset(ThirdOffset), Alignment, MMOFlags);
+
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ Load1.getValue(1), Load2.getValue(1), Load3.getValue(1));
+
+ SDValue Store1 = DAG.getStore(
+ Chain, dl, Load1,
+ DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(0)),
+ DstPtrInfo.getWithOffset(0), Alignment, MMOFlags);
+
+ SDValue Store2 = DAG.getStore(
+ Chain, dl, Load2,
+ DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(16)),
+ DstPtrInfo.getWithOffset(16), Alignment, MMOFlags);
+
+ SDValue Store3 = DAG.getStore(
+ Chain, dl, Load3,
+ DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(ThirdOffset)),
+ DstPtrInfo.getWithOffset(ThirdOffset), Alignment, MMOFlags);
+
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store1, Store2, Store3);
+ }
+ }
+
+ // For sizes 49-63 bytes: use 3 x v16i8 (vectors) + overlapping i64
+ if (SizeVal >= 49 && SizeVal <= 63) {
+ if (AlignmentIsAcceptable(MVT::v16i8, Align(1)) &&
+ AlignmentIsAcceptable(MVT::i64, Align(1))) {
+ uint64_t FourthOffset = SizeVal - 8;
+
+ SDValue Load1 = DAG.getLoad(
+ MVT::v16i8, dl, Chain,
+ DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(0)),
+ SrcPtrInfo.getWithOffset(0), Alignment, MMOFlags);
+
+ SDValue Load2 = DAG.getLoad(
+ MVT::v16i8, dl, Chain,
+ DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(16)),
+ SrcPtrInfo.getWithOffset(16), Alignment, MMOFlags);
+
+ SDValue Load3 = DAG.getLoad(
+ MVT::v16i8, dl, Chain,
+ DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(32)),
+ SrcPtrInfo.getWithOffset(32), Alignment, MMOFlags);
+
+ SDValue Load4 = DAG.getLoad(
+ MVT::i64, dl, Chain,
+ DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(FourthOffset)),
+ SrcPtrInfo.getWithOffset(FourthOffset), Alignment, MMOFlags);
+
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ Load1.getValue(1), Load2.getValue(1), Load3.getValue(1),
+ Load4.getValue(1));
+
+ SDValue Store1 = DAG.getStore(
+ Chain, dl, Load1,
+ DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(0)),
+ DstPtrInfo.getWithOffset(0), Alignment, MMOFlags);
+
+ SDValue Store2 = DAG.getStore(
+ Chain, dl, Load2,
+ DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(16)),
+ DstPtrInfo.getWithOffset(16), Alignment, MMOFlags);
+
+ SDValue Store3 = DAG.getStore(
+ Chain, dl, Load3,
+ DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(32)),
+ DstPtrInfo.getWithOffset(32), Alignment, MMOFlags);
+
+ SDValue Store4 = DAG.getStore(
+ Chain, dl, Load4,
+ DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(FourthOffset)),
+ DstPtrInfo.getWithOffset(FourthOffset), Alignment, MMOFlags);
+
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store1, Store2, Store3, Store4);
+ }
+ }
+
+ // For size 65 bytes: use 4 x v16i8 (vectors) + overlapping i64
+ if (SizeVal == 65) {
+ if (AlignmentIsAcceptable(MVT::v16i8, Align(1)) &&
+ AlignmentIsAcceptable(MVT::i64, Align(1))) {
+ SDValue Load1 = DAG.getLoad(
+ MVT::v16i8, dl, Chain,
+ DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(0)),
+ SrcPtrInfo.getWithOffset(0), Alignment, MMOFlags);
+
+ SDValue Load2 = DAG.getLoad(
+ MVT::v16i8, dl, Chain,
+ DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(16)),
+ SrcPtrInfo.getWithOffset(16), Alignment, MMOFlags);
+
+ SDValue Load3 = DAG.getLoad(
+ MVT::v16i8, dl, Chain,
+ DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(32)),
+ SrcPtrInfo.getWithOffset(32), Alignment, MMOFlags);
+
+ SDValue Load4 = DAG.getLoad(
+ MVT::v16i8, dl, Chain,
+ DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(48)),
+ SrcPtrInfo.getWithOffset(48), Alignment, MMOFlags);
+
+ SDValue Load5 = DAG.getLoad(
+ MVT::i64, dl, Chain,
+ DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(57)),
+ SrcPtrInfo.getWithOffset(57), Alignment, MMOFlags);
+
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ Load1.getValue(1), Load2.getValue(1), Load3.getValue(1),
+ Load4.getValue(1), Load5.getValue(1));
+
+ SDValue Store1 = DAG.getStore(
+ Chain, dl, Load1,
+ DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(0)),
+ DstPtrInfo.getWithOffset(0), Alignment, MMOFlags);
+
+ SDValue Store2 = DAG.getStore(
+ Chain, dl, Load2,
+ DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(16)),
+ DstPtrInfo.getWithOffset(16), Alignment, MMOFlags);
+
+ SDValue Store3 = DAG.getStore(
+ Chain, dl, Load3,
+ DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(32)),
+ DstPtrInfo.getWithOffset(32), Alignment, MMOFlags);
+
+ SDValue Store4 = DAG.getStore(
+ Chain, dl, Load4,
+ DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(48)),
+ DstPtrInfo.getWithOffset(48), Alignment, MMOFlags);
+
+ SDValue Store5 = DAG.getStore(
+ Chain, dl, Load5,
+ DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(57)),
+ DstPtrInfo.getWithOffset(57), Alignment, MMOFlags);
+
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store1, Store2, Store3, Store4, Store5);
+ }
+ }
+ }
+
return SDValue();
}
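As a standalone sketch of the overlap arithmetic used in the branches above (plain C++ for illustration, not the patch code): in the 9-15 byte case the two i64 windows are [0, 8) and [SizeVal - 8, SizeVal), and since SizeVal - 8 <= 7 < 8 the windows overlap and jointly cover every byte.

#include <cstddef>
#include <cstdint>
#include <cstring>

// Scalar mirror of the 9-15 byte lowering, illustration only. Both loads
// happen before either store, so the copy stays correct when dst and src
// overlap (memmove semantics); memcpy through temporaries keeps the
// unaligned accesses well-defined.
static void moveUpTo15(unsigned char *dst, const unsigned char *src,
                       size_t n) { // precondition: 9 <= n <= 15
  uint64_t lo, hi;
  std::memcpy(&lo, src, 8);         // bytes [0, 8)
  std::memcpy(&hi, src + n - 8, 8); // bytes [n-8, n), overlaps the first
  std::memcpy(dst, &lo, 8);
  std::memcpy(dst + n - 8, &hi, 8);
}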
diff --git a/llvm/test/CodeGen/AArch64/memmove-inline.ll b/llvm/test/CodeGen/AArch64/memmove-inline.ll
index 641c48dd0f1c5..0ece0feda9da8 100644
--- a/llvm/test/CodeGen/AArch64/memmove-inline.ll
+++ b/llvm/test/CodeGen/AArch64/memmove-inline.ll
@@ -119,4 +119,102 @@ entry:
ret void
}
+; Test overlapping memmove optimization for non-power-of-two sizes
+; These should use overlapping loads/stores instead of mixed-size operations
+
+define void @move7(ptr %out, ptr %in) {
+; CHECK-ALIGNED-LABEL: move7:
+; CHECK-ALIGNED: // %bb.0: // %entry
+; CHECK-ALIGNED-NEXT: ldur w8, [x1, #3]
+; CHECK-ALIGNED-NEXT: ldr w9, [x1]
+; CHECK-ALIGNED-NEXT: stur w8, [x0, #3]
+; CHECK-ALIGNED-NEXT: str w9, [x0]
+; CHECK-ALIGNED-NEXT: ret
+entry:
+ call void @llvm.memmove.p0.p0.i64(ptr %out, ptr %in, i64 7, i1 false)
+ ret void
+}
+
+define void @move13(ptr %out, ptr %in) {
+; CHECK-ALIGNED-LABEL: move13:
+; CHECK-ALIGNED: // %bb.0: // %entry
+; CHECK-ALIGNED-NEXT: ldur x8, [x1, #5]
+; CHECK-ALIGNED-NEXT: ldr x9, [x1]
+; CHECK-ALIGNED-NEXT: stur x8, [x0, #5]
+; CHECK-ALIGNED-NEXT: str x9, [x0]
+; CHECK-ALIGNED-NEXT: ret
+entry:
+ call void @llvm.memmove.p0.p0.i64(ptr %out, ptr %in, i64 13, i1 false)
+ ret void
+}
+
+define void @move15(ptr %out, ptr %in) {
+; CHECK-ALIGNED-LABEL: move15:
+; CHECK-ALIGNED: // %bb.0: // %entry
+; CHECK-ALIGNED-NEXT: ldur x8, [x1, #7]
+; CHECK-ALIGNED-NEXT: ldr x9, [x1]
+; CHECK-ALIGNED-NEXT: stur x8, [x0, #7]
+; CHECK-ALIGNED-NEXT: str x9, [x0]
+; CHECK-ALIGNED-NEXT: ret
+entry:
+ call void @llvm.memmove.p0.p0.i64(ptr %out, ptr %in, i64 15, i1 false)
+ ret void
+}
+
+define void @move25(ptr %out, ptr %in) {
+; CHECK-ALIGNED-LABEL: move25:
+; CHECK-ALIGNED: // %bb.0: // %entry
+; CHECK-ALIGNED-NEXT: ldur x8, [x1, #17]
+; CHECK-ALIGNED-NEXT: ldr q0, [x1]
+; CHECK-ALIGNED-NEXT: stur x8, [x0, #17]
+; CHECK-ALIGNED-NEXT: str q0, [x0]
+; CHECK-ALIGNED-NEXT: ret
+entry:
+ call void @llvm.memmove.p0.p0.i64(ptr %out, ptr %in, i64 25, i1 false)
+ ret void
+}
+
+define void @move33(ptr %out, ptr %in) {
+; CHECK-ALIGNED-LABEL: move33:
+; CHECK-ALIGNED: // %bb.0: // %entry
+; CHECK-ALIGNED-NEXT: ldp q...
[truncated]
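CHECK lines like these are typically generated with llvm/utils/update_llc_test_checks.py. To inspect the codegen locally, an invocation along these lines should work against a built llc (the exact RUN lines and check prefixes live in memmove-inline.ll; this command is an assumption, not copied from the test):

llc -mtriple=aarch64 -o - llvm/test/CodeGen/AArch64/memmove-inline.ll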
✅ With the latest revision this PR passed the C/C++ code formatter.
🐧 Linux x64 Test Results
…g loads/stores

This change improves memmove code generation for non-power-of-two sizes on AArch64 by using overlapping loads/stores instead of mixed-size operations, matching GCC's approach.

For example, for a 7-byte memmove:
- Before: ldrb + ldrh + ldr (3 loads, 3 stores)
- After: ldur w8, [x1, #3] + ldr w9, [x1] (2 loads, 2 stores)

The optimization handles sizes 5-65 bytes that are not multiples of 8:
- 5-7 bytes: two overlapping i32 operations
- 9-15 bytes: two overlapping i64 operations
- 17-23 bytes: two i64 + one overlapping i64
- 25-31 bytes: one v16i8 vector + one overlapping i64
- 33-47 bytes: two v16i8 vectors + one overlapping i64
- 49-63 bytes: three v16i8 vectors + one overlapping i64
- 65 bytes: four v16i8 vectors + one overlapping i64

This addresses issue llvm#165948 where LLVM generated suboptimal code compared to GCC for non-power-of-two memmove sizes.

Signed-off-by: Osama Abdelkader <osama.abdelkader@gmail.com>
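A plain-C++ sketch of the 7-byte example from this commit message (illustration only, with a hypothetical function name; the patch itself works at the SelectionDAG level):

#include <cstdint>
#include <cstring>

// Copy exactly 7 bytes with two overlapping 4-byte accesses: bytes [0, 4)
// and [3, 7). Loading both values before storing keeps the copy correct
// for overlapping dst/src, and memcpy through temporaries keeps the
// unaligned accesses well-defined.
static void move7(unsigned char *dst, const unsigned char *src) {
  uint32_t lo, hi;
  std::memcpy(&lo, src, 4);     // bytes [0, 4)
  std::memcpy(&hi, src + 3, 4); // bytes [3, 7), overlaps byte 3
  std::memcpy(dst, &lo, 4);
  std::memcpy(dst + 3, &hi, 4);
}

An optimizing compiler can turn this into the two-load/two-store ldr/ldur sequence shown in the tests.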
Force-pushed from c3b31b3 to b54109c.
nasherm left a comment:
I like the idea of this patch, but I have some comments.

The codegen from this patch doesn't seem that much better for test cases move33, move49, and move65: https://godbolt.org/z/nEYvE5YY7
On:

    isVolatile ? MachineMemOperand::MOVolatile : MachineMemOperand::MONone;

    // For sizes 5-7 bytes: use two overlapping i32 operations
    if (SizeVal >= 5 && SizeVal <= 7) {
The pattern in these if blocks seems to be repeating the same core logic. Is there any way you can generalize it into a function? Something that takes the SizeVal as a parameter and generates the DAG node from that?
Done.
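The refactored revision isn't visible in this capture. Purely as a sketch of the shape such a helper could take (hypothetical name and signature, not the code that landed; it uses only DAG APIs already present in the patch and assumes the surrounding file's includes):

// Hypothetical helper: emit one load/store per (type, offset) pair,
// issuing every load before any store so overlapping dst/src ranges keep
// memmove semantics. Callers pick VTs/Offsets per size class, e.g. a
// 13-byte move becomes VTs = {i64, i64}, Offsets = {0, 5}.
static SDValue emitOverlappingMemmove(SelectionDAG &DAG, const SDLoc &dl,
                                      SDValue Chain, SDValue Dst, SDValue Src,
                                      ArrayRef<EVT> VTs,
                                      ArrayRef<uint64_t> Offsets,
                                      MachinePointerInfo DstPtrInfo,
                                      MachinePointerInfo SrcPtrInfo,
                                      Align Alignment,
                                      MachineMemOperand::Flags MMOFlags) {
  SmallVector<SDValue, 8> Loads, LoadChains, Stores;
  for (auto [VT, Off] : llvm::zip(VTs, Offsets)) {
    SDValue Load = DAG.getLoad(
        VT, dl, Chain,
        DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(Off)),
        SrcPtrInfo.getWithOffset(Off), Alignment, MMOFlags);
    Loads.push_back(Load);
    LoadChains.push_back(Load.getValue(1));
  }
  Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains);
  for (auto [Load, Off] : llvm::zip(Loads, Offsets))
    Stores.push_back(DAG.getStore(
        Chain, dl, Load,
        DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(Off)),
        DstPtrInfo.getWithOffset(Off), Alignment, MMOFlags));
  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores);
}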
On:

    // For sizes 33-47 bytes: use 2 x v16i8 (vectors) + overlapping i64
    if (SizeVal >= 33 && SizeVal <= 47) {
Looking at the codegen here (https://godbolt.org/z/nEYvE5YY7), it doesn't seem like we're getting much of an improvement for cases where SizeVal > 32.
Actually, we have optimizations for some sizes, e.g.:
- Size 35: 3 loads/3 stores (optimized) vs. 4 loads/4 stores (unoptimized)
- Size 47: 3 loads/3 stores (optimized) vs. 5 loads/5 stores (unoptimized)
On:

    const AttributeList &FuncAttributes) const {
      if (!Op.isMemset() && !Op.allowOverlap()) {
        uint64_t Size = Op.size();
        bool HandledSize = (Size >= 5 && Size <= 7) ||
This can just use some binary arithmetic. Something like bool HandledSize = (Size & (Size - 1))
Done.
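For context (an illustration, not the landed code): (Size & (Size - 1)) != 0 is the classic non-power-of-two test, though by itself it admits sizes such as 24 or 48 that the explicit ranges excluded, so presumably it complements rather than replaces the bounds checks.

#include <cstdint>

// For n > 0, n & (n - 1) clears the lowest set bit, so the result is zero
// exactly when n is a power of two; a nonzero result therefore means
// "not a power of two".
static bool isNonPowerOfTwo(uint64_t n) {
  return n != 0 && (n & (n - 1)) != 0;
}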
- Refactored repetitive code
- Simplified size check: using binary arithmetic, (Size & (Size - 1)) != 0
- Updated comments

Signed-off-by: Osama Abdelkader <osama.abdelkader@gmail.com>
Force-pushed from 3562902 to 9356a2a.
Fixes #165948