Skip to content

Commit

Permalink
[CGP / PowerPC] avoid multi-block overhead for simple memcmp expansion
Browse files Browse the repository at this point in the history
The test diff for PowerPC shows we can better optimize if this case is one block.

For x86, there's would be a substantial difference if CGP expansion was enabled because branches are assumed 
cheap and SDAG can't optimize across blocks. 

Instead of this:

_cmp_eq8:
  movq  (%rdi), %rax
  cmpq  (%rsi), %rax
  je  LBB23_1
## BB#2:                                ## %res_block
  movl  $1, %ecx
  jmp LBB23_3
LBB23_1:
  xorl  %ecx, %ecx
LBB23_3:                                ## %endblock
  xorl  %eax, %eax
  testl %ecx, %ecx
  sete  %al
  retq

We get this:

cmp_eq8:   
  movq  (%rdi), %rcx
  xorl  %eax, %eax
  cmpq  (%rsi), %rcx
  sete  %al
  retq

And that matches the optimal codegen that we get from the current expansion in SelectionDAGBuilder::visitMemCmpCall(). 
If this looks right, then I just need to confirm that vector-sized expansion will work from here, and we can enable 
CGP memcmp() expansion for x86. Ie, we'll bypass the power-of-2 special cases currently optimized in SDAG because we 
can lower the IR produced here optimally.

Differential Revision: https://reviews.llvm.org/D34005

llvm-svn: 304987
  • Loading branch information
rotateright committed Jun 8, 2017
1 parent 8cb1d09 commit e7c5041
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 26 deletions.
63 changes: 42 additions & 21 deletions llvm/lib/CodeGen/CodeGenPrepare.cpp
Expand Up @@ -1675,6 +1675,7 @@ class MemCmpExpansion {
void emitLoadCompareByteBlock(unsigned Index, int GEPIndex);
void emitMemCmpResultBlock(bool IsLittleEndian);
Value *getMemCmpExpansionZeroCase(unsigned Size, bool IsLittleEndian);
Value *getMemCmpEqZeroOneBlock(unsigned Size);
unsigned getLoadSize(unsigned Size);
unsigned getNumLoads(unsigned Size);

Expand All @@ -1699,31 +1700,35 @@ MemCmpExpansion::MemCmpExpansion(CallInst *CI, uint64_t Size,
unsigned MaxLoadSize, unsigned LoadsPerBlock)
: CI(CI), MaxLoadSize(MaxLoadSize), NumLoadsPerBlock(LoadsPerBlock) {

IRBuilder<> Builder(CI->getContext());
BasicBlock *StartBlock = CI->getParent();
EndBlock = StartBlock->splitBasicBlock(CI, "endblock");
setupEndBlockPHINodes();
// A memcmp with zero-comparison with only one block of load and compare does
// not need to set up any extra blocks. This case could be handled in the DAG,
// but since we have all of the machinery to flexibly expand any memcpy here,
// we choose to handle this case too to avoid fragmented lowering.
IsUsedForZeroCmp = isOnlyUsedInZeroEqualityComparison(CI);

// Calculate how many load compare blocks are required for an expansion of
// given Size.
NumBlocks = calculateNumBlocks(Size);
createResultBlock();
if (!IsUsedForZeroCmp || NumBlocks != 1) {
BasicBlock *StartBlock = CI->getParent();
EndBlock = StartBlock->splitBasicBlock(CI, "endblock");
setupEndBlockPHINodes();
createResultBlock();

// If return value of memcmp is not used in a zero equality, we need to
// calculate which source was larger. The calculation requires the
// two loaded source values of each load compare block.
// These will be saved in the phi nodes created by setupResultBlockPHINodes.
if (!IsUsedForZeroCmp)
setupResultBlockPHINodes();

// If return value of memcmp is not used in a zero equality, we need to
// calculate which source was larger. The calculation requires the
// two loaded source values of each load compare block.
// These will be saved in the phi nodes created by setupResultBlockPHINodes.
if (!IsUsedForZeroCmp)
setupResultBlockPHINodes();
// Create the number of required load compare basic blocks.
createLoadCmpBlocks();

// Create the number of required load compare basic blocks.
createLoadCmpBlocks();
// Update the terminator added by splitBasicBlock to branch to the first
// LoadCmpBlock.
StartBlock->getTerminator()->setSuccessor(0, LoadCmpBlocks[0]);
}

// Update the terminator added by splitBasicBlock to branch to the first
// LoadCmpBlock.
IRBuilder<> Builder(CI->getContext());
Builder.SetCurrentDebugLocation(CI->getDebugLoc());
StartBlock->getTerminator()->setSuccessor(0, LoadCmpBlocks[0]);
}

void MemCmpExpansion::createLoadCmpBlocks() {
Expand Down Expand Up @@ -1810,7 +1815,12 @@ Value *MemCmpExpansion::getCompareLoadPairs(unsigned Index, unsigned Size,
unsigned NumLoadsRemaining = getNumLoads(RemainingBytes);
unsigned NumLoads = std::min(NumLoadsRemaining, NumLoadsPerBlock);

Builder.SetInsertPoint(LoadCmpBlocks[Index]);
// For a single-block expansion, start inserting before the memcmp call.
if (LoadCmpBlocks.empty())
Builder.SetInsertPoint(CI);
else
Builder.SetInsertPoint(LoadCmpBlocks[Index]);

Value *Cmp = nullptr;
for (unsigned i = 0; i < NumLoads; ++i) {
unsigned LoadSize = getLoadSize(RemainingBytes);
Expand Down Expand Up @@ -2071,11 +2081,22 @@ Value *MemCmpExpansion::getMemCmpExpansionZeroCase(unsigned Size,
return PhiRes;
}

/// A memcmp expansion that compares equality with 0 and only has one block of
/// load and compare can bypass the compare, branch, and phi IR that is required
/// in the general case.
Value *MemCmpExpansion::getMemCmpEqZeroOneBlock(unsigned Size) {
unsigned NumBytesProcessed = 0;
IRBuilder<> Builder(CI->getContext());
Value *Cmp = getCompareLoadPairs(0, Size, NumBytesProcessed, Builder);
return Builder.CreateZExt(Cmp, Type::getInt32Ty(CI->getContext()));
}

// This function expands the memcmp call into an inline expansion and returns
// the memcmp result.
Value *MemCmpExpansion::getMemCmpExpansion(uint64_t Size, bool IsLittleEndian) {
if (IsUsedForZeroCmp)
return getMemCmpExpansionZeroCase(Size, IsLittleEndian);
return NumBlocks == 1 ? getMemCmpEqZeroOneBlock(Size) :
getMemCmpExpansionZeroCase(Size, IsLittleEndian);

// This loop calls emitLoadCompareBlock for comparing Size bytes of the two
// memcmp sources. It starts with loading using the maximum load size set by
Expand Down
10 changes: 5 additions & 5 deletions llvm/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll
Expand Up @@ -17,13 +17,13 @@ declare signext i32 @memcmp(i8* nocapture, i8* nocapture, i64) local_unnamed_add
; Check 4 bytes - requires 1 load for each param.
define signext i32 @zeroEqualityTest02(i8* %x, i8* %y) {
; CHECK-LABEL: zeroEqualityTest02:
; CHECK: # BB#0: # %loadbb
; CHECK: # BB#0:
; CHECK-NEXT: lwz 3, 0(3)
; CHECK-NEXT: lwz 4, 0(4)
; CHECK-NEXT: li 5, 1
; CHECK-NEXT: cmplw 3, 4
; CHECK-NEXT: isel 3, 0, 5, 2
; CHECK-NEXT: clrldi 3, 3, 32
; CHECK-NEXT: xor 3, 3, 4
; CHECK-NEXT: cntlzw 3, 3
; CHECK-NEXT: srwi 3, 3, 5
; CHECK-NEXT: xori 3, 3, 1
; CHECK-NEXT: blr
%call = tail call signext i32 @memcmp(i8* %x, i8* %y, i64 4)
%not.cmp = icmp ne i32 %call, 0
Expand Down

0 comments on commit e7c5041

Please sign in to comment.