From b54109cc70596dab091c804ff2ae685a1404560d Mon Sep 17 00:00:00 2001
From: Osama Abdelkader
Date: Wed, 19 Nov 2025 01:56:10 +0200
Subject: [PATCH 1/2] AArch64: Optimize memmove for non-power-of-two sizes
 using overlapping loads/stores

This change improves memmove code generation for non-power-of-two sizes
on AArch64 by using overlapping loads/stores instead of mixed-size
operations, matching GCC's approach.

For example, for a 7-byte memmove:
- Before: ldrb + ldrh + ldr (3 loads, 3 stores)
- After: ldur w8, [x1, #3] + ldr w9, [x1] (2 loads, 2 stores)

The following size ranges between 5 and 65 bytes are handled:
- 5-7 bytes: two overlapping i32 operations
- 9-15 bytes: two overlapping i64 operations
- 17-23 bytes: two i64 + one overlapping i64
- 25-31 bytes: one v16i8 vector + one overlapping i64
- 33-47 bytes: two v16i8 vectors + one overlapping i64
- 49-63 bytes: three v16i8 vectors + one overlapping i64
- 65 bytes: four v16i8 vectors + one overlapping i64

This addresses issue #165948, where LLVM generated suboptimal code
compared to GCC for non-power-of-two memmove sizes.

Signed-off-by: Osama Abdelkader
---
 .../Target/AArch64/AArch64ISelLowering.cpp    |  46 +++
 llvm/lib/Target/AArch64/AArch64ISelLowering.h |   6 +
 .../AArch64/AArch64SelectionDAGInfo.cpp       | 326 ++++++++++++++++++
 llvm/test/CodeGen/AArch64/memmove-inline.ll   |  98 ++++++
 4 files changed, 476 insertions(+)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 8f41f230b5521..2460921050229 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -18502,6 +18502,52 @@ LLT AArch64TargetLowering::getOptimalMemOpLLT(
   return LLT();
 }
 
+bool AArch64TargetLowering::findOptimalMemOpLowering(
+    LLVMContext &Context, std::vector<EVT> &MemOps, unsigned Limit,
+    const MemOp &Op, unsigned DstAS, unsigned SrcAS,
+    const AttributeList &FuncAttributes) const {
+  if (!Op.isMemset() && !Op.allowOverlap()) {
+    uint64_t Size = Op.size();
+    bool HandledSize = (Size >= 5 && Size <= 7) ||
+                       (Size == 9) ||
+                       (Size >= 11 && Size <= 15) ||
+                       (Size >= 17 && Size <= 23) ||
+                       (Size >= 25 && Size <= 31) ||
+                       (Size >= 33 && Size <= 47) ||
+                       (Size >= 49 && Size <= 63) ||
+                       (Size == 65);
+
+    if (HandledSize) {
+      auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
+        if (Op.isAligned(AlignCheck))
+          return true;
+        unsigned Fast;
+        return allowsMisalignedMemoryAccesses(
+                   VT, DstAS, Align(1), MachineMemOperand::MONone, &Fast) &&
+               Fast;
+      };
+
+      // Check if we can use the appropriate type for this size range
+      bool CanHandle = false;
+      if (Size >= 5 && Size <= 7) {
+        CanHandle = AlignmentIsAcceptable(MVT::i32, Align(1));
+      } else if (Size >= 9 && Size <= 23) {
+        CanHandle = AlignmentIsAcceptable(MVT::i64, Align(1));
+      } else if (Size >= 25 && Size <= 65) {
+        CanHandle = AlignmentIsAcceptable(MVT::v16i8, Align(1)) &&
+                    AlignmentIsAcceptable(MVT::i64, Align(1));
+      }
+
+      if (CanHandle)
+        return false;
+    }
+  }
+
+  // Otherwise, use the default implementation
+  return TargetLowering::findOptimalMemOpLowering(Context, MemOps, Limit, Op,
+                                                  DstAS, SrcAS, FuncAttributes);
+}
+
 // 12-bit optionally shifted immediates are legal for adds.
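// ---------------------------------------------------------------------------
// Illustrative sketch (not part of this patch): the source-level equivalent
// of the overlapping lowering described in the commit message, for a
// 5..8-byte memmove. Both chunks are loaded before either store is issued,
// which is what keeps the copy correct when the source and destination
// ranges overlap. The helper name below is hypothetical and used only for
// illustration.
#include <cstddef>
#include <cstdint>
#include <cstring>

static void moveWithTwoOverlappingWords(unsigned char *Dst,
                                        const unsigned char *Src,
                                        std::size_t N) {
  // Assumes 5 <= N <= 8, mirroring the two overlapping i32 operations.
  std::uint32_t Lo, Hi;
  std::memcpy(&Lo, Src, 4);         // load bytes [0, 4)
  std::memcpy(&Hi, Src + N - 4, 4); // load bytes [N-4, N), overlaps the first
  std::memcpy(Dst, &Lo, 4);         // store bytes [0, 4)
  std::memcpy(Dst + N - 4, &Hi, 4); // store bytes [N-4, N)
}
// For N == 7 this is exactly the ldr w9, [x1] / ldur w8, [x1, #3] pattern the
// patch expects the backend to emit.
// ---------------------------------------------------------------------------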
bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const { if (Immed == std::numeric_limits::min()) { diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index be198e54cbcbf..4748835c47938 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -258,6 +258,12 @@ class AArch64TargetLowering : public TargetLowering { EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override; + bool + findOptimalMemOpLowering(LLVMContext &Context, std::vector &MemOps, + unsigned Limit, const MemOp &Op, unsigned DstAS, + unsigned SrcAS, + const AttributeList &FuncAttributes) const override; + LLT getOptimalMemOpLLT(const MemOp &Op, const AttributeList &FuncAttributes) const override; diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp index 48e03ad853d26..60d20672c46cb 100644 --- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp @@ -252,6 +252,332 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemmove( if (LowerToSMERoutines && !Attrs.hasNonStreamingInterfaceAndBody()) return EmitStreamingCompatibleMemLibCall(DAG, dl, Chain, Dst, Src, Size, RTLIB::MEMMOVE); + + // Handle small memmove cases with overlapping loads/stores for better codegen + // For non-power-of-two sizes, use overlapping operations instead of + // mixed-size operations (e.g., for 7 bytes: two i32 loads/stores with overlap + // instead of i32 + i16 + i8) + if (ConstantSDNode *C = dyn_cast(Size)) { + uint64_t SizeVal = C->getZExtValue(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + + auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) { + if (Alignment >= AlignCheck) + return true; + unsigned Fast; + return TLI.allowsMisalignedMemoryAccesses( + VT, DstPtrInfo.getAddrSpace(), Align(1), + MachineMemOperand::MONone, &Fast) && + Fast; + }; + + MachineMemOperand::Flags MMOFlags = + isVolatile ? 
MachineMemOperand::MOVolatile : MachineMemOperand::MONone; + + // For sizes 5-7 bytes: use two overlapping i32 operations + if (SizeVal >= 5 && SizeVal <= 7) { + if (AlignmentIsAcceptable(MVT::i32, Align(1))) { + uint64_t SecondOffset = SizeVal - 4; + + SDValue Load1 = + DAG.getLoad(MVT::i32, dl, Chain, + DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(0)), + SrcPtrInfo.getWithOffset(0), Alignment, MMOFlags); + + SDValue Load2 = DAG.getLoad( + MVT::i32, dl, Chain, + DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(SecondOffset)), + SrcPtrInfo.getWithOffset(SecondOffset), Alignment, MMOFlags); + + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Load1.getValue(1), + Load2.getValue(1)); + + SDValue Store1 = + DAG.getStore(Chain, dl, Load1, + DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(0)), + DstPtrInfo.getWithOffset(0), Alignment, MMOFlags); + + SDValue Store2 = DAG.getStore( + Chain, dl, Load2, + DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(SecondOffset)), + DstPtrInfo.getWithOffset(SecondOffset), Alignment, MMOFlags); + + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store1, Store2); + } + } + + // For sizes 9-15 bytes: use i64 + overlapping i64 + if (SizeVal >= 9 && SizeVal <= 15) { + if (AlignmentIsAcceptable(MVT::i64, Align(1))) { + uint64_t SecondOffset = SizeVal - 8; + + SDValue Load1 = + DAG.getLoad(MVT::i64, dl, Chain, + DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(0)), + SrcPtrInfo.getWithOffset(0), Alignment, MMOFlags); + + SDValue Load2 = DAG.getLoad( + MVT::i64, dl, Chain, + DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(SecondOffset)), + SrcPtrInfo.getWithOffset(SecondOffset), Alignment, MMOFlags); + + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Load1.getValue(1), + Load2.getValue(1)); + + SDValue Store1 = + DAG.getStore(Chain, dl, Load1, + DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(0)), + DstPtrInfo.getWithOffset(0), Alignment, MMOFlags); + + SDValue Store2 = DAG.getStore( + Chain, dl, Load2, + DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(SecondOffset)), + DstPtrInfo.getWithOffset(SecondOffset), Alignment, MMOFlags); + + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store1, Store2); + } + } + + // For sizes 17-23 bytes: use i64 + i64 + overlapping i64 + if (SizeVal >= 17 && SizeVal <= 23) { + if (AlignmentIsAcceptable(MVT::i64, Align(1))) { + uint64_t ThirdOffset = SizeVal - 8; + + SDValue Load1 = + DAG.getLoad(MVT::i64, dl, Chain, + DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(0)), + SrcPtrInfo.getWithOffset(0), Alignment, MMOFlags); + + SDValue Load2 = + DAG.getLoad(MVT::i64, dl, Chain, + DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(8)), + SrcPtrInfo.getWithOffset(8), Alignment, MMOFlags); + + SDValue Load3 = DAG.getLoad( + MVT::i64, dl, Chain, + DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(ThirdOffset)), + SrcPtrInfo.getWithOffset(ThirdOffset), Alignment, MMOFlags); + + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Load1.getValue(1), + Load2.getValue(1), Load3.getValue(1)); + + SDValue Store1 = + DAG.getStore(Chain, dl, Load1, + DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(0)), + DstPtrInfo.getWithOffset(0), Alignment, MMOFlags); + + SDValue Store2 = + DAG.getStore(Chain, dl, Load2, + DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(8)), + DstPtrInfo.getWithOffset(8), Alignment, MMOFlags); + + SDValue Store3 = DAG.getStore( + Chain, dl, Load3, + DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(ThirdOffset)), + DstPtrInfo.getWithOffset(ThirdOffset), Alignment, 
MMOFlags); + + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store1, Store2, + Store3); + } + } + + // For sizes 25-31 bytes: use v16i8 (vector) + overlapping i64 + if (SizeVal >= 25 && SizeVal <= 31) { + if (AlignmentIsAcceptable(MVT::v16i8, Align(1)) && + AlignmentIsAcceptable(MVT::i64, Align(1))) { + uint64_t SecondOffset = SizeVal - 8; + + SDValue Load1 = + DAG.getLoad(MVT::v16i8, dl, Chain, + DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(0)), + SrcPtrInfo.getWithOffset(0), Alignment, MMOFlags); + + SDValue Load2 = DAG.getLoad( + MVT::i64, dl, Chain, + DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(SecondOffset)), + SrcPtrInfo.getWithOffset(SecondOffset), Alignment, MMOFlags); + + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Load1.getValue(1), + Load2.getValue(1)); + + SDValue Store1 = + DAG.getStore(Chain, dl, Load1, + DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(0)), + DstPtrInfo.getWithOffset(0), Alignment, MMOFlags); + + SDValue Store2 = DAG.getStore( + Chain, dl, Load2, + DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(SecondOffset)), + DstPtrInfo.getWithOffset(SecondOffset), Alignment, MMOFlags); + + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store1, Store2); + } + } + + // For sizes 33-47 bytes: use 2 x v16i8 (vectors) + overlapping i64 + if (SizeVal >= 33 && SizeVal <= 47) { + if (AlignmentIsAcceptable(MVT::v16i8, Align(1)) && + AlignmentIsAcceptable(MVT::i64, Align(1))) { + uint64_t ThirdOffset = SizeVal - 8; + + SDValue Load1 = + DAG.getLoad(MVT::v16i8, dl, Chain, + DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(0)), + SrcPtrInfo.getWithOffset(0), Alignment, MMOFlags); + + SDValue Load2 = + DAG.getLoad(MVT::v16i8, dl, Chain, + DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(16)), + SrcPtrInfo.getWithOffset(16), Alignment, MMOFlags); + + SDValue Load3 = DAG.getLoad( + MVT::i64, dl, Chain, + DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(ThirdOffset)), + SrcPtrInfo.getWithOffset(ThirdOffset), Alignment, MMOFlags); + + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Load1.getValue(1), + Load2.getValue(1), Load3.getValue(1)); + + SDValue Store1 = + DAG.getStore(Chain, dl, Load1, + DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(0)), + DstPtrInfo.getWithOffset(0), Alignment, MMOFlags); + + SDValue Store2 = DAG.getStore( + Chain, dl, Load2, + DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(16)), + DstPtrInfo.getWithOffset(16), Alignment, MMOFlags); + + SDValue Store3 = DAG.getStore( + Chain, dl, Load3, + DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(ThirdOffset)), + DstPtrInfo.getWithOffset(ThirdOffset), Alignment, MMOFlags); + + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store1, Store2, + Store3); + } + } + + // For sizes 49-63 bytes: use 3 x v16i8 (vectors) + overlapping i64 + if (SizeVal >= 49 && SizeVal <= 63) { + if (AlignmentIsAcceptable(MVT::v16i8, Align(1)) && + AlignmentIsAcceptable(MVT::i64, Align(1))) { + uint64_t FourthOffset = SizeVal - 8; + + SDValue Load1 = + DAG.getLoad(MVT::v16i8, dl, Chain, + DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(0)), + SrcPtrInfo.getWithOffset(0), Alignment, MMOFlags); + + SDValue Load2 = + DAG.getLoad(MVT::v16i8, dl, Chain, + DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(16)), + SrcPtrInfo.getWithOffset(16), Alignment, MMOFlags); + + SDValue Load3 = + DAG.getLoad(MVT::v16i8, dl, Chain, + DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(32)), + SrcPtrInfo.getWithOffset(32), Alignment, MMOFlags); + + SDValue Load4 = DAG.getLoad( + MVT::i64, dl, 
Chain, + DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(FourthOffset)), + SrcPtrInfo.getWithOffset(FourthOffset), Alignment, MMOFlags); + + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Load1.getValue(1), + Load2.getValue(1), Load3.getValue(1), + Load4.getValue(1)); + + SDValue Store1 = + DAG.getStore(Chain, dl, Load1, + DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(0)), + DstPtrInfo.getWithOffset(0), Alignment, MMOFlags); + + SDValue Store2 = DAG.getStore( + Chain, dl, Load2, + DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(16)), + DstPtrInfo.getWithOffset(16), Alignment, MMOFlags); + + SDValue Store3 = DAG.getStore( + Chain, dl, Load3, + DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(32)), + DstPtrInfo.getWithOffset(32), Alignment, MMOFlags); + + SDValue Store4 = DAG.getStore( + Chain, dl, Load4, + DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(FourthOffset)), + DstPtrInfo.getWithOffset(FourthOffset), Alignment, MMOFlags); + + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store1, Store2, + Store3, Store4); + } + } + + // For size 65 bytes: use 4 x v16i8 (vectors) + overlapping i64 + if (SizeVal == 65) { + if (AlignmentIsAcceptable(MVT::v16i8, Align(1)) && + AlignmentIsAcceptable(MVT::i64, Align(1))) { + + SDValue Load1 = + DAG.getLoad(MVT::v16i8, dl, Chain, + DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(0)), + SrcPtrInfo.getWithOffset(0), Alignment, MMOFlags); + + SDValue Load2 = + DAG.getLoad(MVT::v16i8, dl, Chain, + DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(16)), + SrcPtrInfo.getWithOffset(16), Alignment, MMOFlags); + + SDValue Load3 = + DAG.getLoad(MVT::v16i8, dl, Chain, + DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(32)), + SrcPtrInfo.getWithOffset(32), Alignment, MMOFlags); + + SDValue Load4 = + DAG.getLoad(MVT::v16i8, dl, Chain, + DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(48)), + SrcPtrInfo.getWithOffset(48), Alignment, MMOFlags); + + SDValue Load5 = + DAG.getLoad(MVT::i64, dl, Chain, + DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(57)), + SrcPtrInfo.getWithOffset(57), Alignment, MMOFlags); + + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Load1.getValue(1), + Load2.getValue(1), Load3.getValue(1), + Load4.getValue(1), Load5.getValue(1)); + + SDValue Store1 = + DAG.getStore(Chain, dl, Load1, + DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(0)), + DstPtrInfo.getWithOffset(0), Alignment, MMOFlags); + + SDValue Store2 = DAG.getStore( + Chain, dl, Load2, + DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(16)), + DstPtrInfo.getWithOffset(16), Alignment, MMOFlags); + + SDValue Store3 = DAG.getStore( + Chain, dl, Load3, + DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(32)), + DstPtrInfo.getWithOffset(32), Alignment, MMOFlags); + + SDValue Store4 = DAG.getStore( + Chain, dl, Load4, + DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(48)), + DstPtrInfo.getWithOffset(48), Alignment, MMOFlags); + + SDValue Store5 = DAG.getStore( + Chain, dl, Load5, + DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(57)), + DstPtrInfo.getWithOffset(57), Alignment, MMOFlags); + + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store1, Store2, + Store3, Store4, Store5); + } + } + } + return SDValue(); } diff --git a/llvm/test/CodeGen/AArch64/memmove-inline.ll b/llvm/test/CodeGen/AArch64/memmove-inline.ll index 641c48dd0f1c5..0ece0feda9da8 100644 --- a/llvm/test/CodeGen/AArch64/memmove-inline.ll +++ b/llvm/test/CodeGen/AArch64/memmove-inline.ll @@ -119,4 +119,102 @@ entry: ret void } +; Test overlapping memmove 
optimization for non-power-of-two sizes +; These should use overlapping loads/stores instead of mixed-size operations + +define void @move7(ptr %out, ptr %in) { +; CHECK-ALIGNED-LABEL: move7: +; CHECK-ALIGNED: // %bb.0: // %entry +; CHECK-ALIGNED-NEXT: ldur w8, [x1, #3] +; CHECK-ALIGNED-NEXT: ldr w9, [x1] +; CHECK-ALIGNED-NEXT: stur w8, [x0, #3] +; CHECK-ALIGNED-NEXT: str w9, [x0] +; CHECK-ALIGNED-NEXT: ret +entry: + call void @llvm.memmove.p0.p0.i64(ptr %out, ptr %in, i64 7, i1 false) + ret void +} + +define void @move13(ptr %out, ptr %in) { +; CHECK-ALIGNED-LABEL: move13: +; CHECK-ALIGNED: // %bb.0: // %entry +; CHECK-ALIGNED-NEXT: ldur x8, [x1, #5] +; CHECK-ALIGNED-NEXT: ldr x9, [x1] +; CHECK-ALIGNED-NEXT: stur x8, [x0, #5] +; CHECK-ALIGNED-NEXT: str x9, [x0] +; CHECK-ALIGNED-NEXT: ret +entry: + call void @llvm.memmove.p0.p0.i64(ptr %out, ptr %in, i64 13, i1 false) + ret void +} + +define void @move15(ptr %out, ptr %in) { +; CHECK-ALIGNED-LABEL: move15: +; CHECK-ALIGNED: // %bb.0: // %entry +; CHECK-ALIGNED-NEXT: ldur x8, [x1, #7] +; CHECK-ALIGNED-NEXT: ldr x9, [x1] +; CHECK-ALIGNED-NEXT: stur x8, [x0, #7] +; CHECK-ALIGNED-NEXT: str x9, [x0] +; CHECK-ALIGNED-NEXT: ret +entry: + call void @llvm.memmove.p0.p0.i64(ptr %out, ptr %in, i64 15, i1 false) + ret void +} + +define void @move25(ptr %out, ptr %in) { +; CHECK-ALIGNED-LABEL: move25: +; CHECK-ALIGNED: // %bb.0: // %entry +; CHECK-ALIGNED-NEXT: ldur x8, [x1, #17] +; CHECK-ALIGNED-NEXT: ldr q0, [x1] +; CHECK-ALIGNED-NEXT: stur x8, [x0, #17] +; CHECK-ALIGNED-NEXT: str q0, [x0] +; CHECK-ALIGNED-NEXT: ret +entry: + call void @llvm.memmove.p0.p0.i64(ptr %out, ptr %in, i64 25, i1 false) + ret void +} + +define void @move33(ptr %out, ptr %in) { +; CHECK-ALIGNED-LABEL: move33: +; CHECK-ALIGNED: // %bb.0: // %entry +; CHECK-ALIGNED-NEXT: ldp q1, q0, [x1] +; CHECK-ALIGNED-NEXT: ldur x8, [x1, #25] +; CHECK-ALIGNED-NEXT: stur x8, [x0, #25] +; CHECK-ALIGNED-NEXT: stp q1, q0, [x0] +; CHECK-ALIGNED-NEXT: ret +entry: + call void @llvm.memmove.p0.p0.i64(ptr %out, ptr %in, i64 33, i1 false) + ret void +} + +define void @move49(ptr %out, ptr %in) { +; CHECK-ALIGNED-LABEL: move49: +; CHECK-ALIGNED: // %bb.0: // %entry +; CHECK-ALIGNED-NEXT: ldp q2, q0, [x1, #16] +; CHECK-ALIGNED-NEXT: ldur x8, [x1, #41] +; CHECK-ALIGNED-NEXT: ldr q1, [x1] +; CHECK-ALIGNED-NEXT: stur x8, [x0, #41] +; CHECK-ALIGNED-NEXT: stp q2, q0, [x0, #16] +; CHECK-ALIGNED-NEXT: str q1, [x0] +; CHECK-ALIGNED-NEXT: ret +entry: + call void @llvm.memmove.p0.p0.i64(ptr %out, ptr %in, i64 49, i1 false) + ret void +} + +define void @move65(ptr %out, ptr %in) { +; CHECK-ALIGNED-LABEL: move65: +; CHECK-ALIGNED: // %bb.0: // %entry +; CHECK-ALIGNED-NEXT: ldp q0, q1, [x1, #32] +; CHECK-ALIGNED-NEXT: ldur x8, [x1, #57] +; CHECK-ALIGNED-NEXT: ldp q2, q3, [x1] +; CHECK-ALIGNED-NEXT: stur x8, [x0, #57] +; CHECK-ALIGNED-NEXT: stp q0, q1, [x0, #32] +; CHECK-ALIGNED-NEXT: stp q2, q3, [x0] +; CHECK-ALIGNED-NEXT: ret +entry: + call void @llvm.memmove.p0.p0.i64(ptr %out, ptr %in, i64 65, i1 false) + ret void +} + declare void @llvm.memmove.p0.p0.i64(ptr nocapture, ptr nocapture readonly, i64, i1) From 9356a2a35bad18f46368eec42ba0ce1a87beac66 Mon Sep 17 00:00:00 2001 From: Osama Abdelkader Date: Thu, 20 Nov 2025 18:59:35 +0200 Subject: [PATCH 2/2] AArch64: Optimize memmove for non-power-of-two sizes refactoring - Refactored repetitive code - Simplified size check: Using binary arithmetic (Size & (Size - 1)) != 0 - Updated comments Signed-off-by: Osama Abdelkader --- .../Target/AArch64/AArch64ISelLowering.cpp 
| 14 +- .../AArch64/AArch64SelectionDAGInfo.cpp | 416 +++++------------- 2 files changed, 125 insertions(+), 305 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 2460921050229..d314db2407f14 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -18508,16 +18508,10 @@ bool AArch64TargetLowering::findOptimalMemOpLowering( const AttributeList &FuncAttributes) const { if (!Op.isMemset() && !Op.allowOverlap()) { uint64_t Size = Op.size(); - bool HandledSize = (Size >= 5 && Size <= 7) || - (Size == 9) || - (Size >= 11 && Size <= 15) || - (Size >= 17 && Size <= 23) || - (Size >= 25 && Size <= 31) || - (Size >= 33 && Size <= 47) || - (Size >= 49 && Size <= 63) || - (Size == 65); - - if (HandledSize) { + // Only handle non-power-of-two sizes > 4 and <= 65, excluding size 10 + // which doesn't show improvement. Check if size is non-power-of-two: + // (Size & (Size - 1)) != 0 + if (Size > 4 && Size <= 65 && Size != 10 && (Size & (Size - 1)) != 0) { auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) { if (Op.isAligned(AlignCheck)) return true; diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp index 60d20672c46cb..92fccd8fac457 100644 --- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp @@ -12,6 +12,7 @@ #include "AArch64SelectionDAGInfo.h" #include "AArch64MachineFunctionInfo.h" +#include "llvm/ADT/ArrayRef.h" #define GET_SDNODE_DESC #include "AArch64GenSDNodeInfo.inc" @@ -236,6 +237,44 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset( return SDValue(); } +// Helper function to generate overlapping loads/stores for memmove. +// Takes a list of (EVT, offset) pairs for loads/stores and generates the DAG. 
+static SDValue EmitOverlappingMemmove(
+    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
+    ArrayRef<std::pair<EVT, uint64_t>> LoadOps, Align Alignment,
+    MachineMemOperand::Flags MMOFlags, MachinePointerInfo DstPtrInfo,
+    MachinePointerInfo SrcPtrInfo) {
+  SmallVector<SDValue, 8> Loads;
+  SmallVector<SDValue, 8> LoadChains;
+
+  // Generate all loads
+  for (const auto &[VT, Offset] : LoadOps) {
+    SDValue Load =
+        DAG.getLoad(VT, dl, Chain,
+                    DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(Offset)),
+                    SrcPtrInfo.getWithOffset(Offset), Alignment, MMOFlags);
+    Loads.push_back(Load);
+    LoadChains.push_back(Load.getValue(1));
+  }
+
+  // Combine all load chains
+  Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains);
+
+  // Generate all stores
+  SmallVector<SDValue, 8> Stores;
+  for (size_t i = 0; i < LoadOps.size(); ++i) {
+    uint64_t Offset = LoadOps[i].second;
+    SDValue Store = DAG.getStore(
+        Chain, dl, Loads[i],
+        DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(Offset)),
+        DstPtrInfo.getWithOffset(Offset), Alignment, MMOFlags);
+    Stores.push_back(Store);
+  }
+
+  // Combine all store chains
+  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores);
+}
+
 SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemmove(
     SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
     SDValue Size, Align Alignment, bool isVolatile,
@@ -256,7 +295,9 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemmove(
   // Handle small memmove cases with overlapping loads/stores for better codegen
   // For non-power-of-two sizes, use overlapping operations instead of
   // mixed-size operations (e.g., for 7 bytes: two i32 loads/stores with overlap
-  // instead of i32 + i16 + i8)
+  // instead of i32 + i16 + i8). This optimization provides significant
+  // improvement for most sizes, though some specific sizes (e.g., 33, 49, 65)
+  // may show less improvement than others in their range.
   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Size)) {
     uint64_t SizeVal = C->getZExtValue();
     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -274,306 +315,91 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemmove(
   MachineMemOperand::Flags MMOFlags =
       isVolatile ?
MachineMemOperand::MOVolatile : MachineMemOperand::MONone; - // For sizes 5-7 bytes: use two overlapping i32 operations - if (SizeVal >= 5 && SizeVal <= 7) { - if (AlignmentIsAcceptable(MVT::i32, Align(1))) { - uint64_t SecondOffset = SizeVal - 4; - - SDValue Load1 = - DAG.getLoad(MVT::i32, dl, Chain, - DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(0)), - SrcPtrInfo.getWithOffset(0), Alignment, MMOFlags); - - SDValue Load2 = DAG.getLoad( - MVT::i32, dl, Chain, - DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(SecondOffset)), - SrcPtrInfo.getWithOffset(SecondOffset), Alignment, MMOFlags); - - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Load1.getValue(1), - Load2.getValue(1)); - - SDValue Store1 = - DAG.getStore(Chain, dl, Load1, - DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(0)), - DstPtrInfo.getWithOffset(0), Alignment, MMOFlags); - - SDValue Store2 = DAG.getStore( - Chain, dl, Load2, - DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(SecondOffset)), - DstPtrInfo.getWithOffset(SecondOffset), Alignment, MMOFlags); - - return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store1, Store2); + // Only handle non-power-of-two sizes > 4 and <= 65 + // Check if size is non-power-of-two: (Size & (Size - 1)) != 0 + if (SizeVal > 4 && SizeVal <= 65 && (SizeVal & (SizeVal - 1)) != 0) { + SmallVector, 4> LoadOps; + + // For sizes 5-7 bytes: use two overlapping i32 operations + if (SizeVal >= 5 && SizeVal <= 7) { + if (AlignmentIsAcceptable(MVT::i32, Align(1))) { + LoadOps.push_back({MVT::i32, 0}); + LoadOps.push_back({MVT::i32, SizeVal - 4}); + return EmitOverlappingMemmove(DAG, dl, Chain, Dst, Src, LoadOps, + Alignment, MMOFlags, DstPtrInfo, + SrcPtrInfo); + } } - } - - // For sizes 9-15 bytes: use i64 + overlapping i64 - if (SizeVal >= 9 && SizeVal <= 15) { - if (AlignmentIsAcceptable(MVT::i64, Align(1))) { - uint64_t SecondOffset = SizeVal - 8; - - SDValue Load1 = - DAG.getLoad(MVT::i64, dl, Chain, - DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(0)), - SrcPtrInfo.getWithOffset(0), Alignment, MMOFlags); - - SDValue Load2 = DAG.getLoad( - MVT::i64, dl, Chain, - DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(SecondOffset)), - SrcPtrInfo.getWithOffset(SecondOffset), Alignment, MMOFlags); - - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Load1.getValue(1), - Load2.getValue(1)); - - SDValue Store1 = - DAG.getStore(Chain, dl, Load1, - DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(0)), - DstPtrInfo.getWithOffset(0), Alignment, MMOFlags); - - SDValue Store2 = DAG.getStore( - Chain, dl, Load2, - DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(SecondOffset)), - DstPtrInfo.getWithOffset(SecondOffset), Alignment, MMOFlags); - - return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store1, Store2); + // For sizes 9-15 bytes: use i64 + overlapping i64 + else if (SizeVal >= 9 && SizeVal <= 15) { + if (AlignmentIsAcceptable(MVT::i64, Align(1))) { + LoadOps.push_back({MVT::i64, 0}); + LoadOps.push_back({MVT::i64, SizeVal - 8}); + return EmitOverlappingMemmove(DAG, dl, Chain, Dst, Src, LoadOps, + Alignment, MMOFlags, DstPtrInfo, + SrcPtrInfo); + } } - } - - // For sizes 17-23 bytes: use i64 + i64 + overlapping i64 - if (SizeVal >= 17 && SizeVal <= 23) { - if (AlignmentIsAcceptable(MVT::i64, Align(1))) { - uint64_t ThirdOffset = SizeVal - 8; - - SDValue Load1 = - DAG.getLoad(MVT::i64, dl, Chain, - DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(0)), - SrcPtrInfo.getWithOffset(0), Alignment, MMOFlags); - - SDValue Load2 = - DAG.getLoad(MVT::i64, dl, Chain, - 
DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(8)), - SrcPtrInfo.getWithOffset(8), Alignment, MMOFlags); - - SDValue Load3 = DAG.getLoad( - MVT::i64, dl, Chain, - DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(ThirdOffset)), - SrcPtrInfo.getWithOffset(ThirdOffset), Alignment, MMOFlags); - - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Load1.getValue(1), - Load2.getValue(1), Load3.getValue(1)); - - SDValue Store1 = - DAG.getStore(Chain, dl, Load1, - DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(0)), - DstPtrInfo.getWithOffset(0), Alignment, MMOFlags); - - SDValue Store2 = - DAG.getStore(Chain, dl, Load2, - DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(8)), - DstPtrInfo.getWithOffset(8), Alignment, MMOFlags); - - SDValue Store3 = DAG.getStore( - Chain, dl, Load3, - DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(ThirdOffset)), - DstPtrInfo.getWithOffset(ThirdOffset), Alignment, MMOFlags); - - return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store1, Store2, - Store3); + // For sizes 17-23 bytes: use i64 + i64 + overlapping i64 + else if (SizeVal >= 17 && SizeVal <= 23) { + if (AlignmentIsAcceptable(MVT::i64, Align(1))) { + LoadOps.push_back({MVT::i64, 0}); + LoadOps.push_back({MVT::i64, 8}); + LoadOps.push_back({MVT::i64, SizeVal - 8}); + return EmitOverlappingMemmove(DAG, dl, Chain, Dst, Src, LoadOps, + Alignment, MMOFlags, DstPtrInfo, + SrcPtrInfo); + } } - } - - // For sizes 25-31 bytes: use v16i8 (vector) + overlapping i64 - if (SizeVal >= 25 && SizeVal <= 31) { - if (AlignmentIsAcceptable(MVT::v16i8, Align(1)) && - AlignmentIsAcceptable(MVT::i64, Align(1))) { - uint64_t SecondOffset = SizeVal - 8; - - SDValue Load1 = - DAG.getLoad(MVT::v16i8, dl, Chain, - DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(0)), - SrcPtrInfo.getWithOffset(0), Alignment, MMOFlags); - - SDValue Load2 = DAG.getLoad( - MVT::i64, dl, Chain, - DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(SecondOffset)), - SrcPtrInfo.getWithOffset(SecondOffset), Alignment, MMOFlags); - - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Load1.getValue(1), - Load2.getValue(1)); - - SDValue Store1 = - DAG.getStore(Chain, dl, Load1, - DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(0)), - DstPtrInfo.getWithOffset(0), Alignment, MMOFlags); - - SDValue Store2 = DAG.getStore( - Chain, dl, Load2, - DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(SecondOffset)), - DstPtrInfo.getWithOffset(SecondOffset), Alignment, MMOFlags); - - return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store1, Store2); + // For sizes 25-31 bytes: use v16i8 (vector) + overlapping i64 + else if (SizeVal >= 25 && SizeVal <= 31) { + if (AlignmentIsAcceptable(MVT::v16i8, Align(1)) && + AlignmentIsAcceptable(MVT::i64, Align(1))) { + LoadOps.push_back({MVT::v16i8, 0}); + LoadOps.push_back({MVT::i64, SizeVal - 8}); + return EmitOverlappingMemmove(DAG, dl, Chain, Dst, Src, LoadOps, + Alignment, MMOFlags, DstPtrInfo, + SrcPtrInfo); + } } - } - - // For sizes 33-47 bytes: use 2 x v16i8 (vectors) + overlapping i64 - if (SizeVal >= 33 && SizeVal <= 47) { - if (AlignmentIsAcceptable(MVT::v16i8, Align(1)) && - AlignmentIsAcceptable(MVT::i64, Align(1))) { - uint64_t ThirdOffset = SizeVal - 8; - - SDValue Load1 = - DAG.getLoad(MVT::v16i8, dl, Chain, - DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(0)), - SrcPtrInfo.getWithOffset(0), Alignment, MMOFlags); - - SDValue Load2 = - DAG.getLoad(MVT::v16i8, dl, Chain, - DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(16)), - SrcPtrInfo.getWithOffset(16), Alignment, MMOFlags); - 
- SDValue Load3 = DAG.getLoad( - MVT::i64, dl, Chain, - DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(ThirdOffset)), - SrcPtrInfo.getWithOffset(ThirdOffset), Alignment, MMOFlags); - - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Load1.getValue(1), - Load2.getValue(1), Load3.getValue(1)); - - SDValue Store1 = - DAG.getStore(Chain, dl, Load1, - DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(0)), - DstPtrInfo.getWithOffset(0), Alignment, MMOFlags); - - SDValue Store2 = DAG.getStore( - Chain, dl, Load2, - DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(16)), - DstPtrInfo.getWithOffset(16), Alignment, MMOFlags); - - SDValue Store3 = DAG.getStore( - Chain, dl, Load3, - DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(ThirdOffset)), - DstPtrInfo.getWithOffset(ThirdOffset), Alignment, MMOFlags); - - return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store1, Store2, - Store3); + // For sizes 33-47 bytes: use 2 x v16i8 (vectors) + overlapping i64 + else if (SizeVal >= 33 && SizeVal <= 47) { + if (AlignmentIsAcceptable(MVT::v16i8, Align(1)) && + AlignmentIsAcceptable(MVT::i64, Align(1))) { + LoadOps.push_back({MVT::v16i8, 0}); + LoadOps.push_back({MVT::v16i8, 16}); + LoadOps.push_back({MVT::i64, SizeVal - 8}); + return EmitOverlappingMemmove(DAG, dl, Chain, Dst, Src, LoadOps, + Alignment, MMOFlags, DstPtrInfo, + SrcPtrInfo); + } } - } - - // For sizes 49-63 bytes: use 3 x v16i8 (vectors) + overlapping i64 - if (SizeVal >= 49 && SizeVal <= 63) { - if (AlignmentIsAcceptable(MVT::v16i8, Align(1)) && - AlignmentIsAcceptable(MVT::i64, Align(1))) { - uint64_t FourthOffset = SizeVal - 8; - - SDValue Load1 = - DAG.getLoad(MVT::v16i8, dl, Chain, - DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(0)), - SrcPtrInfo.getWithOffset(0), Alignment, MMOFlags); - - SDValue Load2 = - DAG.getLoad(MVT::v16i8, dl, Chain, - DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(16)), - SrcPtrInfo.getWithOffset(16), Alignment, MMOFlags); - - SDValue Load3 = - DAG.getLoad(MVT::v16i8, dl, Chain, - DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(32)), - SrcPtrInfo.getWithOffset(32), Alignment, MMOFlags); - - SDValue Load4 = DAG.getLoad( - MVT::i64, dl, Chain, - DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(FourthOffset)), - SrcPtrInfo.getWithOffset(FourthOffset), Alignment, MMOFlags); - - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Load1.getValue(1), - Load2.getValue(1), Load3.getValue(1), - Load4.getValue(1)); - - SDValue Store1 = - DAG.getStore(Chain, dl, Load1, - DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(0)), - DstPtrInfo.getWithOffset(0), Alignment, MMOFlags); - - SDValue Store2 = DAG.getStore( - Chain, dl, Load2, - DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(16)), - DstPtrInfo.getWithOffset(16), Alignment, MMOFlags); - - SDValue Store3 = DAG.getStore( - Chain, dl, Load3, - DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(32)), - DstPtrInfo.getWithOffset(32), Alignment, MMOFlags); - - SDValue Store4 = DAG.getStore( - Chain, dl, Load4, - DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(FourthOffset)), - DstPtrInfo.getWithOffset(FourthOffset), Alignment, MMOFlags); - - return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store1, Store2, - Store3, Store4); + // For sizes 49-63 bytes: use 3 x v16i8 (vectors) + overlapping i64 + else if (SizeVal >= 49 && SizeVal <= 63) { + if (AlignmentIsAcceptable(MVT::v16i8, Align(1)) && + AlignmentIsAcceptable(MVT::i64, Align(1))) { + LoadOps.push_back({MVT::v16i8, 0}); + LoadOps.push_back({MVT::v16i8, 16}); + 
LoadOps.push_back({MVT::v16i8, 32}); + LoadOps.push_back({MVT::i64, SizeVal - 8}); + return EmitOverlappingMemmove(DAG, dl, Chain, Dst, Src, LoadOps, + Alignment, MMOFlags, DstPtrInfo, + SrcPtrInfo); + } } - } - - // For size 65 bytes: use 4 x v16i8 (vectors) + overlapping i64 - if (SizeVal == 65) { - if (AlignmentIsAcceptable(MVT::v16i8, Align(1)) && - AlignmentIsAcceptable(MVT::i64, Align(1))) { - - SDValue Load1 = - DAG.getLoad(MVT::v16i8, dl, Chain, - DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(0)), - SrcPtrInfo.getWithOffset(0), Alignment, MMOFlags); - - SDValue Load2 = - DAG.getLoad(MVT::v16i8, dl, Chain, - DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(16)), - SrcPtrInfo.getWithOffset(16), Alignment, MMOFlags); - - SDValue Load3 = - DAG.getLoad(MVT::v16i8, dl, Chain, - DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(32)), - SrcPtrInfo.getWithOffset(32), Alignment, MMOFlags); - - SDValue Load4 = - DAG.getLoad(MVT::v16i8, dl, Chain, - DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(48)), - SrcPtrInfo.getWithOffset(48), Alignment, MMOFlags); - - SDValue Load5 = - DAG.getLoad(MVT::i64, dl, Chain, - DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(57)), - SrcPtrInfo.getWithOffset(57), Alignment, MMOFlags); - - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Load1.getValue(1), - Load2.getValue(1), Load3.getValue(1), - Load4.getValue(1), Load5.getValue(1)); - - SDValue Store1 = - DAG.getStore(Chain, dl, Load1, - DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(0)), - DstPtrInfo.getWithOffset(0), Alignment, MMOFlags); - - SDValue Store2 = DAG.getStore( - Chain, dl, Load2, - DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(16)), - DstPtrInfo.getWithOffset(16), Alignment, MMOFlags); - - SDValue Store3 = DAG.getStore( - Chain, dl, Load3, - DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(32)), - DstPtrInfo.getWithOffset(32), Alignment, MMOFlags); - - SDValue Store4 = DAG.getStore( - Chain, dl, Load4, - DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(48)), - DstPtrInfo.getWithOffset(48), Alignment, MMOFlags); - - SDValue Store5 = DAG.getStore( - Chain, dl, Load5, - DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(57)), - DstPtrInfo.getWithOffset(57), Alignment, MMOFlags); - - return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store1, Store2, - Store3, Store4, Store5); + // For size 65 bytes: use 4 x v16i8 (vectors) + overlapping i64 + else if (SizeVal == 65) { + if (AlignmentIsAcceptable(MVT::v16i8, Align(1)) && + AlignmentIsAcceptable(MVT::i64, Align(1))) { + LoadOps.push_back({MVT::v16i8, 0}); + LoadOps.push_back({MVT::v16i8, 16}); + LoadOps.push_back({MVT::v16i8, 32}); + LoadOps.push_back({MVT::v16i8, 48}); + LoadOps.push_back({MVT::i64, SizeVal - 8}); + return EmitOverlappingMemmove(DAG, dl, Chain, Dst, Src, LoadOps, + Alignment, MMOFlags, DstPtrInfo, + SrcPtrInfo); + } } } }
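// ---------------------------------------------------------------------------
// Illustrative sketch (not part of this patch): a standalone harness for
// sanity-checking a "chunked, all loads before all stores" schedule like the
// (type, offset) lists passed to EmitOverlappingMemmove above. It verifies
// that a candidate (width, offset) list covers every byte of the copy and
// that executing all loads before all stores matches std::memmove for every
// overlap of source and destination. The Chunk struct and checkSchedule
// helper are hypothetical names used only for this sketch; widths 4/8/16
// stand in for i32/i64/v16i8.
#include <cstddef>
#include <cstdio>
#include <cstring>
#include <vector>

struct Chunk {
  std::size_t Width;  // chunk width in bytes
  std::size_t Offset; // byte offset of the chunk within the copied region
};

static bool checkSchedule(std::size_t N, const std::vector<Chunk> &Ops) {
  // Every byte of [0, N) must be covered by at least one chunk.
  std::vector<bool> Covered(N, false);
  for (const Chunk &Op : Ops)
    for (std::size_t I = 0; I < Op.Width; ++I)
      if (Op.Offset + I < N)
        Covered[Op.Offset + I] = true;
  for (std::size_t I = 0; I < N; ++I)
    if (!Covered[I])
      return false;

  // Cross-check the loads-then-stores schedule against memmove for every
  // possible overlap of source and destination.
  for (int Shift = -(int)N; Shift <= (int)N; ++Shift) {
    unsigned char Ref[256], Got[256];
    for (int I = 0; I < 256; ++I)
      Ref[I] = Got[I] = (unsigned char)(I * 7 + 1);
    std::memmove(Ref + 96 + Shift, Ref + 96, N);

    unsigned char Tmp[8][16];
    for (std::size_t I = 0; I < Ops.size(); ++I) // all loads first
      std::memcpy(Tmp[I], Got + 96 + Ops[I].Offset, Ops[I].Width);
    for (std::size_t I = 0; I < Ops.size(); ++I) // then all stores
      std::memcpy(Got + 96 + Shift + Ops[I].Offset, Tmp[I], Ops[I].Width);

    if (std::memcmp(Ref, Got, 256) != 0)
      return false;
  }
  return true;
}

int main() {
  // The 7-byte and 13-byte schedules from the commit message.
  bool Ok7 = checkSchedule(7, {{4, 0}, {4, 3}});
  bool Ok13 = checkSchedule(13, {{8, 0}, {8, 5}});
  std::printf("7-byte schedule: %s, 13-byte schedule: %s\n",
              Ok7 ? "ok" : "BROKEN", Ok13 ? "ok" : "BROKEN");
  return (Ok7 && Ok13) ? 0 : 1;
}
// ---------------------------------------------------------------------------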