40 changes: 40 additions & 0 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -18502,6 +18502,46 @@ LLT AArch64TargetLowering::getOptimalMemOpLLT(
return LLT();
}

bool AArch64TargetLowering::findOptimalMemOpLowering(
LLVMContext &Context, std::vector<EVT> &MemOps, unsigned Limit,
const MemOp &Op, unsigned DstAS, unsigned SrcAS,
const AttributeList &FuncAttributes) const {
if (!Op.isMemset() && !Op.allowOverlap()) {
uint64_t Size = Op.size();
// Only handle non-power-of-two sizes > 4 and <= 65, excluding size 10,
// which does not show an improvement. A size is non-power-of-two when
// (Size & (Size - 1)) != 0.
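// For example, Size = 7 gives 7 & 6 == 6 (non-power-of-two, handled here),
// while Size = 8 gives 8 & 7 == 0 (a power of two, left to the default path).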
if (Size > 4 && Size <= 65 && Size != 10 && (Size & (Size - 1)) != 0) {
auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
if (Op.isAligned(AlignCheck))
return true;
unsigned Fast;
return allowsMisalignedMemoryAccesses(
VT, DstAS, Align(1), MachineMemOperand::MONone, &Fast) &&
Fast;
};

// Check if we can use the appropriate type for this size range
bool CanHandle = false;
if (Size >= 5 && Size <= 7) {
CanHandle = AlignmentIsAcceptable(MVT::i32, Align(1));
} else if (Size >= 9 && Size <= 23) {
CanHandle = AlignmentIsAcceptable(MVT::i64, Align(1));
} else if (Size >= 25 && Size <= 65) {
CanHandle = AlignmentIsAcceptable(MVT::v16i8, Align(1)) &&
AlignmentIsAcceptable(MVT::i64, Align(1));
}

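// Returning false here rejects the generic inline expansion for these sizes;
// the memmove then falls through to the overlapping lowering in
// AArch64SelectionDAGInfo::EmitTargetCodeForMemmove.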
if (CanHandle)
return false;
}
}

// Otherwise, use the default implementation
return TargetLowering::findOptimalMemOpLowering(Context, MemOps, Limit, Op,
DstAS, SrcAS, FuncAttributes);
}

// 12-bit optionally shifted immediates are legal for adds.
bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
if (Immed == std::numeric_limits<int64_t>::min()) {
6 changes: 6 additions & 0 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -258,6 +258,12 @@ class AArch64TargetLowering : public TargetLowering {
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op,
const AttributeList &FuncAttributes) const override;

bool
findOptimalMemOpLowering(LLVMContext &Context, std::vector<EVT> &MemOps,
unsigned Limit, const MemOp &Op, unsigned DstAS,
unsigned SrcAS,
const AttributeList &FuncAttributes) const override;

LLT getOptimalMemOpLLT(const MemOp &Op,
const AttributeList &FuncAttributes) const override;

152 changes: 152 additions & 0 deletions llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
@@ -12,6 +12,7 @@

#include "AArch64SelectionDAGInfo.h"
#include "AArch64MachineFunctionInfo.h"
#include "llvm/ADT/ArrayRef.h"

#define GET_SDNODE_DESC
#include "AArch64GenSDNodeInfo.inc"
@@ -236,6 +237,44 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset(
return SDValue();
}

// Helper function to generate overlapping loads/stores for memmove.
// Takes a list of (EVT, offset) pairs for loads/stores and generates the DAG.
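// All loads are emitted and their chains merged before any store is emitted,
// so the lowering stays correct when the source and destination regions
// overlap, as memmove requires.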
static SDValue EmitOverlappingMemmove(
SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
ArrayRef<std::pair<EVT, uint64_t>> LoadOps, Align Alignment,
MachineMemOperand::Flags MMOFlags, MachinePointerInfo DstPtrInfo,
MachinePointerInfo SrcPtrInfo) {
SmallVector<SDValue, 8> Loads;
SmallVector<SDValue, 8> LoadChains;

// Generate all loads
for (const auto &[VT, Offset] : LoadOps) {
SDValue Load =
DAG.getLoad(VT, dl, Chain,
DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(Offset)),
SrcPtrInfo.getWithOffset(Offset), Alignment, MMOFlags);
Loads.push_back(Load);
LoadChains.push_back(Load.getValue(1));
}

// Combine all load chains
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains);

// Generate all stores
SmallVector<SDValue, 8> Stores;
for (size_t i = 0; i < LoadOps.size(); ++i) {
uint64_t Offset = LoadOps[i].second;
SDValue Store = DAG.getStore(
Chain, dl, Loads[i],
DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(Offset)),
DstPtrInfo.getWithOffset(Offset), Alignment, MMOFlags);
Stores.push_back(Store);
}

// Combine all store chains
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores);
}

SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemmove(
SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
SDValue Size, Align Alignment, bool isVolatile,
@@ -252,6 +291,119 @@
if (LowerToSMERoutines && !Attrs.hasNonStreamingInterfaceAndBody())
return EmitStreamingCompatibleMemLibCall(DAG, dl, Chain, Dst, Src, Size,
RTLIB::MEMMOVE);

// Handle small memmove cases with overlapping loads/stores for better codegen.
// For non-power-of-two sizes, use overlapping operations instead of
// mixed-size operations (e.g., for 7 bytes: two i32 loads/stores with overlap
// instead of i32 + i16 + i8). This optimization provides a significant
// improvement for most sizes, though some specific sizes (e.g., 33, 49, 65)
// may show less improvement than others in their range.
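// For example, a 7-byte memmove becomes an i32 load/store at offset 0 plus an
// i32 load/store at offset 3 (SizeVal - 4); together the two accesses cover
// bytes [0, 7) with a one-byte overlap.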
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Size)) {
uint64_t SizeVal = C->getZExtValue();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();

auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
if (Alignment >= AlignCheck)
return true;
unsigned Fast;
return TLI.allowsMisalignedMemoryAccesses(
VT, DstPtrInfo.getAddrSpace(), Align(1),
MachineMemOperand::MONone, &Fast) &&
Fast;
};

MachineMemOperand::Flags MMOFlags =
isVolatile ? MachineMemOperand::MOVolatile : MachineMemOperand::MONone;

// Only handle non-power-of-two sizes > 4 and <= 65. A size is
// non-power-of-two when (SizeVal & (SizeVal - 1)) != 0.
if (SizeVal > 4 && SizeVal <= 65 && (SizeVal & (SizeVal - 1)) != 0) {
SmallVector<std::pair<EVT, uint64_t>, 4> LoadOps;

// For sizes 5-7 bytes: use two overlapping i32 operations
if (SizeVal >= 5 && SizeVal <= 7) {
if (AlignmentIsAcceptable(MVT::i32, Align(1))) {
LoadOps.push_back({MVT::i32, 0});
LoadOps.push_back({MVT::i32, SizeVal - 4});
return EmitOverlappingMemmove(DAG, dl, Chain, Dst, Src, LoadOps,
Alignment, MMOFlags, DstPtrInfo,
SrcPtrInfo);
}
}
// For sizes 9-15 bytes: use i64 + overlapping i64
else if (SizeVal >= 9 && SizeVal <= 15) {
if (AlignmentIsAcceptable(MVT::i64, Align(1))) {
LoadOps.push_back({MVT::i64, 0});
LoadOps.push_back({MVT::i64, SizeVal - 8});
return EmitOverlappingMemmove(DAG, dl, Chain, Dst, Src, LoadOps,
Alignment, MMOFlags, DstPtrInfo,
SrcPtrInfo);
}
}
// For sizes 17-23 bytes: use i64 + i64 + overlapping i64
else if (SizeVal >= 17 && SizeVal <= 23) {
if (AlignmentIsAcceptable(MVT::i64, Align(1))) {
LoadOps.push_back({MVT::i64, 0});
LoadOps.push_back({MVT::i64, 8});
LoadOps.push_back({MVT::i64, SizeVal - 8});
return EmitOverlappingMemmove(DAG, dl, Chain, Dst, Src, LoadOps,
Alignment, MMOFlags, DstPtrInfo,
SrcPtrInfo);
}
}
// For sizes 25-31 bytes: use v16i8 (vector) + overlapping i64
else if (SizeVal >= 25 && SizeVal <= 31) {
if (AlignmentIsAcceptable(MVT::v16i8, Align(1)) &&
AlignmentIsAcceptable(MVT::i64, Align(1))) {
LoadOps.push_back({MVT::v16i8, 0});
LoadOps.push_back({MVT::i64, SizeVal - 8});
return EmitOverlappingMemmove(DAG, dl, Chain, Dst, Src, LoadOps,
Alignment, MMOFlags, DstPtrInfo,
SrcPtrInfo);
}
}
// For sizes 33-47 bytes: use 2 x v16i8 (vectors) + overlapping i64
else if (SizeVal >= 33 && SizeVal <= 47) {
if (AlignmentIsAcceptable(MVT::v16i8, Align(1)) &&
AlignmentIsAcceptable(MVT::i64, Align(1))) {
LoadOps.push_back({MVT::v16i8, 0});
LoadOps.push_back({MVT::v16i8, 16});
LoadOps.push_back({MVT::i64, SizeVal - 8});
return EmitOverlappingMemmove(DAG, dl, Chain, Dst, Src, LoadOps,
Alignment, MMOFlags, DstPtrInfo,
SrcPtrInfo);
}
}
// For sizes 49-63 bytes: use 3 x v16i8 (vectors) + overlapping i64
else if (SizeVal >= 49 && SizeVal <= 63) {
if (AlignmentIsAcceptable(MVT::v16i8, Align(1)) &&
AlignmentIsAcceptable(MVT::i64, Align(1))) {
LoadOps.push_back({MVT::v16i8, 0});
LoadOps.push_back({MVT::v16i8, 16});
LoadOps.push_back({MVT::v16i8, 32});
LoadOps.push_back({MVT::i64, SizeVal - 8});
return EmitOverlappingMemmove(DAG, dl, Chain, Dst, Src, LoadOps,
Alignment, MMOFlags, DstPtrInfo,
SrcPtrInfo);
}
}
// For size 65 bytes: use 4 x v16i8 (vectors) + overlapping i64
else if (SizeVal == 65) {
if (AlignmentIsAcceptable(MVT::v16i8, Align(1)) &&
AlignmentIsAcceptable(MVT::i64, Align(1))) {
LoadOps.push_back({MVT::v16i8, 0});
LoadOps.push_back({MVT::v16i8, 16});
LoadOps.push_back({MVT::v16i8, 32});
LoadOps.push_back({MVT::v16i8, 48});
LoadOps.push_back({MVT::i64, SizeVal - 8});
return EmitOverlappingMemmove(DAG, dl, Chain, Dst, Src, LoadOps,
Alignment, MMOFlags, DstPtrInfo,
SrcPtrInfo);
}
}
}
}

return SDValue();
}

98 changes: 98 additions & 0 deletions llvm/test/CodeGen/AArch64/memmove-inline.ll
@@ -119,4 +119,102 @@
ret void
}

; Test overlapping memmove optimization for non-power-of-two sizes
; These should use overlapping loads/stores instead of mixed-size operations
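; In each case the trailing access sits at offset (size - access width),
; e.g. #3 for move7 and #57 for move65, so it overlaps the preceding access.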

define void @move7(ptr %out, ptr %in) {
; CHECK-ALIGNED-LABEL: move7:
; CHECK-ALIGNED: // %bb.0: // %entry
; CHECK-ALIGNED-NEXT: ldur w8, [x1, #3]
; CHECK-ALIGNED-NEXT: ldr w9, [x1]
; CHECK-ALIGNED-NEXT: stur w8, [x0, #3]
; CHECK-ALIGNED-NEXT: str w9, [x0]
; CHECK-ALIGNED-NEXT: ret
entry:
call void @llvm.memmove.p0.p0.i64(ptr %out, ptr %in, i64 7, i1 false)
ret void
}

define void @move13(ptr %out, ptr %in) {
; CHECK-ALIGNED-LABEL: move13:
; CHECK-ALIGNED: // %bb.0: // %entry
; CHECK-ALIGNED-NEXT: ldur x8, [x1, #5]
; CHECK-ALIGNED-NEXT: ldr x9, [x1]
; CHECK-ALIGNED-NEXT: stur x8, [x0, #5]
; CHECK-ALIGNED-NEXT: str x9, [x0]
; CHECK-ALIGNED-NEXT: ret
entry:
call void @llvm.memmove.p0.p0.i64(ptr %out, ptr %in, i64 13, i1 false)
ret void
}

define void @move15(ptr %out, ptr %in) {
; CHECK-ALIGNED-LABEL: move15:
; CHECK-ALIGNED: // %bb.0: // %entry
; CHECK-ALIGNED-NEXT: ldur x8, [x1, #7]
; CHECK-ALIGNED-NEXT: ldr x9, [x1]
; CHECK-ALIGNED-NEXT: stur x8, [x0, #7]
; CHECK-ALIGNED-NEXT: str x9, [x0]
; CHECK-ALIGNED-NEXT: ret
entry:
call void @llvm.memmove.p0.p0.i64(ptr %out, ptr %in, i64 15, i1 false)
ret void
}

define void @move25(ptr %out, ptr %in) {
; CHECK-ALIGNED-LABEL: move25:
; CHECK-ALIGNED: // %bb.0: // %entry
; CHECK-ALIGNED-NEXT: ldur x8, [x1, #17]
; CHECK-ALIGNED-NEXT: ldr q0, [x1]
; CHECK-ALIGNED-NEXT: stur x8, [x0, #17]
; CHECK-ALIGNED-NEXT: str q0, [x0]
; CHECK-ALIGNED-NEXT: ret
entry:
call void @llvm.memmove.p0.p0.i64(ptr %out, ptr %in, i64 25, i1 false)
ret void
}

define void @move33(ptr %out, ptr %in) {
; CHECK-ALIGNED-LABEL: move33:
; CHECK-ALIGNED: // %bb.0: // %entry
; CHECK-ALIGNED-NEXT: ldp q1, q0, [x1]
; CHECK-ALIGNED-NEXT: ldur x8, [x1, #25]
; CHECK-ALIGNED-NEXT: stur x8, [x0, #25]
; CHECK-ALIGNED-NEXT: stp q1, q0, [x0]
; CHECK-ALIGNED-NEXT: ret
entry:
call void @llvm.memmove.p0.p0.i64(ptr %out, ptr %in, i64 33, i1 false)
ret void
}

define void @move49(ptr %out, ptr %in) {
; CHECK-ALIGNED-LABEL: move49:
; CHECK-ALIGNED: // %bb.0: // %entry
; CHECK-ALIGNED-NEXT: ldp q2, q0, [x1, #16]
; CHECK-ALIGNED-NEXT: ldur x8, [x1, #41]
; CHECK-ALIGNED-NEXT: ldr q1, [x1]
; CHECK-ALIGNED-NEXT: stur x8, [x0, #41]
; CHECK-ALIGNED-NEXT: stp q2, q0, [x0, #16]
; CHECK-ALIGNED-NEXT: str q1, [x0]
; CHECK-ALIGNED-NEXT: ret
entry:
call void @llvm.memmove.p0.p0.i64(ptr %out, ptr %in, i64 49, i1 false)
ret void
}

define void @move65(ptr %out, ptr %in) {
; CHECK-ALIGNED-LABEL: move65:
; CHECK-ALIGNED: // %bb.0: // %entry
; CHECK-ALIGNED-NEXT: ldp q0, q1, [x1, #32]
; CHECK-ALIGNED-NEXT: ldur x8, [x1, #57]
; CHECK-ALIGNED-NEXT: ldp q2, q3, [x1]
; CHECK-ALIGNED-NEXT: stur x8, [x0, #57]
; CHECK-ALIGNED-NEXT: stp q0, q1, [x0, #32]
; CHECK-ALIGNED-NEXT: stp q2, q3, [x0]
; CHECK-ALIGNED-NEXT: ret
entry:
call void @llvm.memmove.p0.p0.i64(ptr %out, ptr %in, i64 65, i1 false)
ret void
}

declare void @llvm.memmove.p0.p0.i64(ptr nocapture, ptr nocapture readonly, i64, i1)