Skip to content

Commit

Permalink
[X86] For minsize, use size for alignment, rather than actual alignment
Browse files Browse the repository at this point in the history
If we have minsize, then we don't care about the alignment.
On x86, the CPU doesn't care and neither should you.

As long as the count is aligned, we can use fewer instructions.
  • Loading branch information
AreaZR committed Apr 6, 2024
1 parent b161d4f commit 5e36b38
Show file tree
Hide file tree
Showing 3 changed files with 78 additions and 300 deletions.
109 changes: 70 additions & 39 deletions llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,11 +66,23 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
// If not DWORD aligned or size is more than the threshold, call the library.
// The libc version is likely to be faster for these cases. It can use the
// address value and run time information about the CPU.
if (Alignment < Align(4) || !ConstantSize ||
ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold())
if (!ConstantSize)
return SDValue();

uint64_t SizeVal = ConstantSize->getZExtValue();
if (!AlwaysInline &&
(Alignment < Align(4) || SizeVal > Subtarget.getMaxInlineSizeThreshold()))
return SDValue();

// If we have minsize, then don't care about the alignment.
// On x86, the CPU doesn't care and neither should you.
// As long as the count is aligned, we can use the minimum number of
// instructions without always having to resort to stosb.
//
// Because this is a feature specific to x86, we must handle it here.
if (DAG.getMachineFunction().getFunction().hasMinSize())
commonAlignment(Align(Subtarget.is64Bit() ? 8 : 4), SizeVal);

SDValue InGlue;
EVT AVT;
SDValue Count;
Expand All @@ -80,13 +92,13 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
uint64_t Val = ValC->getZExtValue() & 255;

// If the value is a constant, then we can potentially use larger sets.
if (Alignment > Align(2)) {
if (Alignment >= Align(4)) {
// DWORD aligned
AVT = MVT::i32;
ValReg = X86::EAX;
Val = (Val << 8) | Val;
Val = (Val << 16) | Val;
if (Subtarget.is64Bit() && Alignment > Align(8)) { // QWORD aligned
if (Subtarget.is64Bit() && Alignment >= Align(8)) { // QWORD aligned
AVT = MVT::i64;
ValReg = X86::RAX;
Val = (Val << 32) | Val;
Expand All @@ -103,12 +115,10 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
Count = DAG.getIntPtrConstant(SizeVal, dl);
}

if (AVT.bitsGT(MVT::i8)) {
unsigned UBytes = AVT.getSizeInBits() / 8;
Count = DAG.getIntPtrConstant(SizeVal / UBytes, dl);
BytesLeft = SizeVal % UBytes;
}

const uint64_t BlockBytes = AVT.getSizeInBits() / 8;
const uint64_t BlockCount = SizeVal / BlockBytes;
Count = DAG.getIntPtrConstant(BlockCount, dl);
BytesLeft = SizeVal % BlockBytes;
Chain = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, dl, AVT),
InGlue);
InGlue = Chain.getValue(1);
Expand All @@ -120,34 +130,41 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
}

bool Use64BitRegs = Subtarget.isTarget64BitLP64();
Chain = DAG.getCopyToReg(Chain, dl, Use64BitRegs ? X86::RCX : X86::ECX,
Count, InGlue);
Chain = DAG.getCopyToReg(Chain, dl, Use64BitRegs ? X86::RCX : X86::ECX, Count,
InGlue);
InGlue = Chain.getValue(1);
Chain = DAG.getCopyToReg(Chain, dl, Use64BitRegs ? X86::RDI : X86::EDI,
Dst, InGlue);
Chain = DAG.getCopyToReg(Chain, dl, Use64BitRegs ? X86::RDI : X86::EDI, Dst,
InGlue);
InGlue = Chain.getValue(1);

SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
SDValue Ops[] = { Chain, DAG.getValueType(AVT), InGlue };
Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops);

if (BytesLeft) {
// Handle the last 1 - 7 bytes.
unsigned Offset = SizeVal - BytesLeft;
EVT AddrVT = Dst.getValueType();
EVT SizeVT = Size.getValueType();

Chain =
DAG.getMemset(Chain, dl,
DAG.getNode(ISD::ADD, dl, AddrVT, Dst,
DAG.getConstant(Offset, dl, AddrVT)),
Val, DAG.getConstant(BytesLeft, dl, SizeVT), Alignment,
isVolatile, AlwaysInline,
/* isTailCall */ false, DstPtrInfo.getWithOffset(Offset));
}
SDValue Ops[] = {Chain, DAG.getValueType(AVT), InGlue};
SDValue RepStos = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops);

/// RepStos can process the whole length.
//
// Because we changed the alignment earlier in the function to work on size
// when we have the minsize attribute, this is guaranteed to be 0 when we get
// here.
if (BytesLeft == 0)
return RepStos;

// TODO: Use a Tokenfactor, as in memcpy, instead of a single chain.
return Chain;
// Handle the last 1 - 7 bytes.
SmallVector<SDValue, 4> Results;
Results.push_back(RepStos);
unsigned Offset = SizeVal - BytesLeft;
EVT AddrVT = Dst.getValueType();
EVT SizeVT = Size.getValueType();

Results.push_back(
DAG.getMemset(Chain, dl,
DAG.getNode(ISD::ADD, dl, AddrVT, Dst,
DAG.getConstant(Offset, dl, AddrVT)),
Val, DAG.getConstant(BytesLeft, dl, SizeVT), Alignment,
isVolatile, /* AlwaysInline */ true,
/* isTailCall */ false, DstPtrInfo.getWithOffset(Offset)));

return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Results);
}

/// Emit a single REP MOVS{B,W,D,Q} instruction.
Expand Down Expand Up @@ -220,13 +237,32 @@ static SDValue emitConstantSizeRepmov(
assert(!Subtarget.hasERMSB() && "No efficient RepMovs");
/// We assume runtime memcpy will do a better job for unaligned copies when
/// ERMS is not present.
if (!AlwaysInline && (Alignment.value() & 3) != 0)
if (!AlwaysInline && (Alignment < Align(4)))
return SDValue();

// If we have minsize, then don't care about the alignment.
// On x86, the CPU doesn't care and neither should you.
// As long as the count is aligned, we can use the minimum number of
// instructions without always having to resort to movsb
//
// Because this is a feature specific to x86, we must handle it here.

if (DAG.getMachineFunction().getFunction().hasMinSize())
commonAlignment(Align(Subtarget.is64Bit() ? 8 : 4), Size);

const MVT BlockType = getOptimalRepmovsType(Subtarget, Alignment);
const uint64_t BlockBytes = BlockType.getSizeInBits() / 8;
const uint64_t BlockCount = Size / BlockBytes;
const uint64_t BytesLeft = Size % BlockBytes;

if (DAG.getMachineFunction().getFunction().hasMinSize()) {
// Use the one instruction determined. Because we changed the alignment
// earlier in the function to work on size when we have the minsize
// attribute, it is guaranteed to process the entire length.
return emitRepmovs(Subtarget, DAG, dl, Chain, Dst, Src,
DAG.getIntPtrConstant(BlockCount, dl), BlockType);
}

SDValue RepMovs =
emitRepmovs(Subtarget, DAG, dl, Chain, Dst, Src,
DAG.getIntPtrConstant(BlockCount, dl), BlockType);
Expand All @@ -237,11 +273,6 @@ static SDValue emitConstantSizeRepmov(

assert(BytesLeft && "We have leftover at this point");

/// In case we optimize for size we use repmovsb even if it's less efficient
/// so we can save the loads/stores of the leftover.
if (DAG.getMachineFunction().getFunction().hasMinSize())
return emitRepmovsB(Subtarget, DAG, dl, Chain, Dst, Src, Size);

// Handle the last 1 - 7 bytes.
SmallVector<SDValue, 4> Results;
Results.push_back(RepMovs);
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/X86/memset-minsize.ll
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,10 @@ entry:
define void @small_memset_to_rep_stos(ptr %ptr) minsize nounwind {
; CHECK-LABEL: small_memset_to_rep_stos:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq $32
; CHECK-NEXT: pushq $16
; CHECK-NEXT: popq %rcx
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: rep;stosl %eax, %es:(%rdi)
; CHECK-NEXT: rep;stosq %rax, %es:(%rdi)
; CHECK-NEXT: retq
entry:
call void @llvm.memset.p0.i32(ptr align 4 %ptr, i8 0, i32 128, i1 false)
Expand Down
Loading

0 comments on commit 5e36b38

Please sign in to comment.