[CodeGenPrepare] limit overflow intrinsic matching to a single basic block

Using/updating a dominator tree to match math overflow patterns may be very
expensive in compile time (because of the way CGP uses a DT), so just handle
the single-block case.

Also, we were restarting the iterator loops when doing the overflow intrinsic
transforms by marking the dominator tree for update. That was done to prevent
iterating over a removed instruction. But we can postpone the deletion using
the existing "RemovedInsts" structure, and that means we don't need to update
the DT.
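
A minimal sketch of that deferred-deletion pattern (illustrative helper names,
not the patch itself; the pass's actual member is the RemovedInsts set that
appears in the diff below):

#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/IR/Instruction.h"

using namespace llvm;

// Record an instruction for later deletion instead of erasing it while the
// pass is still iterating over its block.
static void deferRemoval(Instruction *I, Value *Replacement,
                         SmallPtrSetImpl<Instruction *> &RemovedInsts) {
  I->replaceAllUsesWith(Replacement); // detach all uses now ...
  RemovedInsts.insert(I);             // ... but keep the instruction in place
}

// Run after the iterator loops finish, so no iterator can still point at a
// deferred instruction.
static void flushRemoved(SmallPtrSetImpl<Instruction *> &RemovedInsts) {
  for (Instruction *I : RemovedInsts) {
    if (I->getParent())      // still linked into a block?
      I->removeFromParent(); // unlink first,
    I->deleteValue();        // then free the Value itself
  }
  RemovedInsts.clear();
}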

See post-commit thread for rL354298 for more details:
http://lists.llvm.org/pipermail/llvm-commits/Week-of-Mon-20190422/646276.html

Differential Revision: https://reviews.llvm.org/D61075

llvm-svn: 359879
rotateright authored and MrSidims committed May 17, 2019
1 parent 18cb28e commit 99f8ad0
Showing 4 changed files with 65 additions and 68 deletions.
89 changes: 42 additions & 47 deletions llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -388,9 +388,9 @@ class TypePromotionTransaction;
     bool tryToSinkFreeOperands(Instruction *I);
     bool replaceMathCmpWithIntrinsic(BinaryOperator *BO, CmpInst *Cmp,
                                      Intrinsic::ID IID);
-    bool optimizeCmp(CmpInst *Cmp, bool &ModifiedDT);
-    bool combineToUSubWithOverflow(CmpInst *Cmp, bool &ModifiedDT);
-    bool combineToUAddWithOverflow(CmpInst *Cmp, bool &ModifiedDT);
+    bool optimizeCmp(CmpInst *Cmp);
+    bool combineToUSubWithOverflow(CmpInst *Cmp);
+    bool combineToUAddWithOverflow(CmpInst *Cmp);
   };
 
 } // end anonymous namespace
@@ -484,9 +484,13 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
     if (!LargeOffsetGEPMap.empty())
       MadeChange |= splitLargeGEPOffsets();
 
-    // Really free removed instructions during promotion.
-    for (Instruction *I : RemovedInsts)
+    // Really free instructions removed during promotion or kept around to
+    // improve efficiency (see comments in overflow intrinsic transforms).
+    for (Instruction *I : RemovedInsts) {
+      if (I->getParent())
+        I->removeFromParent();
       I->deleteValue();
+    }
 
     EverMadeChange |= MadeChange;
     SeenChainsForSExt.clear();
@@ -1177,6 +1181,20 @@ static bool OptimizeNoopCopyExpression(CastInst *CI, const TargetLowering &TLI,
 bool CodeGenPrepare::replaceMathCmpWithIntrinsic(BinaryOperator *BO,
                                                  CmpInst *Cmp,
                                                  Intrinsic::ID IID) {
+  if (BO->getParent() != Cmp->getParent()) {
+    // We used to use a dominator tree here to allow multi-block optimization.
+    // But that was problematic because:
+    // 1. It could cause a perf regression by hoisting the math op into the
+    //    critical path.
+    // 2. It could cause a perf regression by creating a value that was live
+    //    across multiple blocks and increasing register pressure.
+    // 3. Use of a dominator tree could cause large compile-time regression.
+    //    This is because we recompute the DT on every change in the main CGP
+    //    run-loop. The recomputing is probably unnecessary in many cases, so if
+    //    that was fixed, using a DT here would be ok.
+    return false;
+  }
+
   // We allow matching the canonical IR (add X, C) back to (usubo X, -C).
   Value *Arg0 = BO->getOperand(0);
   Value *Arg1 = BO->getOperand(1);
@@ -1186,45 +1204,28 @@ bool CodeGenPrepare::replaceMathCmpWithIntrinsic(BinaryOperator *BO,
     Arg1 = ConstantExpr::getNeg(cast<Constant>(Arg1));
   }
 
-  Instruction *InsertPt;
-  if (BO->hasOneUse() && BO->user_back() == Cmp) {
-    // If the math is only used by the compare, insert at the compare to keep
-    // the condition in the same block as its users. (CGP aggressively sinks
-    // compares to help out SDAG.)
-    InsertPt = Cmp;
-  } else {
-    // The math and compare may be independent instructions. Check dominance to
-    // determine the insertion point for the intrinsic.
-    bool MathDominates = getDT(*BO->getFunction()).dominates(BO, Cmp);
-    if (!MathDominates && !getDT(*BO->getFunction()).dominates(Cmp, BO))
-      return false;
-
-    BasicBlock *MathBB = BO->getParent(), *CmpBB = Cmp->getParent();
-    if (MathBB != CmpBB) {
-      // Avoid hoisting an extra op into a dominating block and creating a
-      // potentially longer critical path.
-      if (!MathDominates)
-        return false;
-      // Check that the insertion doesn't create a value that is live across
-      // more than two blocks, so to minimise the increase in register pressure.
-      BasicBlock *Dominator = MathDominates ? MathBB : CmpBB;
-      BasicBlock *Dominated = MathDominates ? CmpBB : MathBB;
-      auto Successors = successors(Dominator);
-      if (llvm::find(Successors, Dominated) == Successors.end())
-        return false;
-    }
-
-    InsertPt = MathDominates ? cast<Instruction>(BO) : cast<Instruction>(Cmp);
-  }
+  // Insert at the first instruction of the pair.
+  Instruction *InsertPt = nullptr;
+  for (Instruction &Iter : *Cmp->getParent()) {
+    if (&Iter == BO || &Iter == Cmp) {
+      InsertPt = &Iter;
+      break;
+    }
+  }
+  assert(InsertPt != nullptr && "Parent block did not contain cmp or binop");
 
   IRBuilder<> Builder(InsertPt);
   Value *MathOV = Builder.CreateBinaryIntrinsic(IID, Arg0, Arg1);
   Value *Math = Builder.CreateExtractValue(MathOV, 0, "math");
   Value *OV = Builder.CreateExtractValue(MathOV, 1, "ov");
+
+  // Delay the actual removal/deletion of the binop and compare for efficiency.
+  // If we delete those now, we would invalidate the instruction iterator and
+  // trigger dominator tree updates.
   BO->replaceAllUsesWith(Math);
   Cmp->replaceAllUsesWith(OV);
-  BO->eraseFromParent();
-  Cmp->eraseFromParent();
+  RemovedInsts.insert(BO);
+  RemovedInsts.insert(Cmp);
   return true;
 }
@@ -1260,8 +1261,7 @@ static bool matchUAddWithOverflowConstantEdgeCases(CmpInst *Cmp,
 
 /// Try to combine the compare into a call to the llvm.uadd.with.overflow
 /// intrinsic. Return true if any changes were made.
-bool CodeGenPrepare::combineToUAddWithOverflow(CmpInst *Cmp,
-                                               bool &ModifiedDT) {
+bool CodeGenPrepare::combineToUAddWithOverflow(CmpInst *Cmp) {
   Value *A, *B;
   BinaryOperator *Add;
   if (!match(Cmp, m_UAddWithOverflow(m_Value(A), m_Value(B), m_BinOp(Add))))
@@ -1281,13 +1281,10 @@ bool CodeGenPrepare::combineToUAddWithOverflow(CmpInst *Cmp,
   if (!replaceMathCmpWithIntrinsic(Add, Cmp, Intrinsic::uadd_with_overflow))
     return false;
 
-  // Reset callers - do not crash by iterating over a dead instruction.
-  ModifiedDT = true;
   return true;
 }
 
-bool CodeGenPrepare::combineToUSubWithOverflow(CmpInst *Cmp,
-                                               bool &ModifiedDT) {
+bool CodeGenPrepare::combineToUSubWithOverflow(CmpInst *Cmp) {
   // We are not expecting non-canonical/degenerate code. Just bail out.
   Value *A = Cmp->getOperand(0), *B = Cmp->getOperand(1);
   if (isa<Constant>(A) && isa<Constant>(B))
@@ -1342,8 +1339,6 @@ bool CodeGenPrepare::combineToUSubWithOverflow(CmpInst *Cmp,
   if (!replaceMathCmpWithIntrinsic(Sub, Cmp, Intrinsic::usub_with_overflow))
     return false;
 
-  // Reset callers - do not crash by iterating over a dead instruction.
-  ModifiedDT = true;
   return true;
 }
 
@@ -1413,14 +1408,14 @@ static bool sinkCmpExpression(CmpInst *Cmp, const TargetLowering &TLI) {
   return MadeChange;
 }
 
-bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, bool &ModifiedDT) {
+bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp) {
   if (sinkCmpExpression(Cmp, *TLI))
     return true;
 
-  if (combineToUAddWithOverflow(Cmp, ModifiedDT))
+  if (combineToUAddWithOverflow(Cmp))
     return true;
 
-  if (combineToUSubWithOverflow(Cmp, ModifiedDT))
+  if (combineToUSubWithOverflow(Cmp))
     return true;
 
   return false;
@@ -6945,7 +6940,7 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, bool &ModifiedDT) {
   }
 
   if (auto *Cmp = dyn_cast<CmpInst>(I))
-    if (TLI && optimizeCmp(Cmp, ModifiedDT))
+    if (TLI && optimizeCmp(Cmp))
       return true;
 
   if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
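
Distilled from the CodeGenPrepare.cpp hunks above, the patched matching flow is
roughly the following (a condensed sketch under a hypothetical free-function
name; the real code is a CodeGenPrepare member and also performs the usubo
constant negation and the deferred removal shown in the diff):

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

static bool formOverflowIntrinsicSketch(BinaryOperator *BO, CmpInst *Cmp,
                                        Intrinsic::ID IID) {
  // Single-block only: no dominator tree queries, no hoisting into a
  // dominating block, no values made live across extra blocks.
  if (BO->getParent() != Cmp->getParent())
    return false;

  // Insert at whichever of the pair appears first in the block, found by a
  // linear scan (both are known to be in this block, so the scan always hits).
  Instruction *InsertPt = nullptr;
  for (Instruction &Iter : *Cmp->getParent())
    if (&Iter == BO || &Iter == Cmp) {
      InsertPt = &Iter;
      break;
    }

  // Build the intrinsic and route the old results to its two elements.
  IRBuilder<> Builder(InsertPt);
  Value *MathOV = Builder.CreateBinaryIntrinsic(IID, BO->getOperand(0),
                                                BO->getOperand(1));
  BO->replaceAllUsesWith(Builder.CreateExtractValue(MathOV, 0, "math"));
  Cmp->replaceAllUsesWith(Builder.CreateExtractValue(MathOV, 1, "ov"));
  return true;
}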
15 changes: 9 additions & 6 deletions llvm/test/CodeGen/X86/cgp-usubo.ll
@@ -121,7 +121,7 @@ define i1 @usubo_ne_constant0_op1_i32(i32 %x, i32* %p) {
   ret i1 %ov
 }
 
-; Verify insertion point for multi-BB.
+; This used to verify insertion point for multi-BB, but now we just bail out.
 
 declare void @call(i1)
 
@@ -131,14 +131,17 @@ define i1 @usubo_ult_sub_dominates_i64(i64 %x, i64 %y, i64* %p, i1 %cond) nounwi
 ; CHECK-NEXT:    testb $1, %cl
 ; CHECK-NEXT:    je .LBB8_2
 ; CHECK-NEXT:  # %bb.1: # %t
-; CHECK-NEXT:    subq %rsi, %rdi
-; CHECK-NEXT:    setb %al
-; CHECK-NEXT:    movq %rdi, (%rdx)
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    subq %rsi, %rax
+; CHECK-NEXT:    movq %rax, (%rdx)
 ; CHECK-NEXT:    testb $1, %cl
-; CHECK-NEXT:    jne .LBB8_3
+; CHECK-NEXT:    je .LBB8_2
+; CHECK-NEXT:  # %bb.3: # %end
+; CHECK-NEXT:    cmpq %rsi, %rdi
+; CHECK-NEXT:    setb %al
+; CHECK-NEXT:    retq
 ; CHECK-NEXT:  .LBB8_2: # %f
 ; CHECK-NEXT:    movl %ecx, %eax
-; CHECK-NEXT:  .LBB8_3: # %end
 ; CHECK-NEXT:    retq
 entry:
   br i1 %cond, label %t, label %f
9 changes: 4 additions & 5 deletions llvm/test/Transforms/CodeGenPrepare/X86/optimizeSelect-DT.ll
@@ -14,11 +14,10 @@ define i1 @PR41004(i32 %x, i32 %y, i32 %t1) {
 ; CHECK-NEXT:    br label [[SELECT_END]]
 ; CHECK:       select.end:
 ; CHECK-NEXT:    [[MUL:%.*]] = phi i32 [ [[REM]], [[SELECT_TRUE_SINK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[T1:%.*]], i32 1)
-; CHECK-NEXT:    [[MATH:%.*]] = extractvalue { i32, i1 } [[TMP0]], 0
-; CHECK-NEXT:    [[OV:%.*]] = extractvalue { i32, i1 } [[TMP0]], 1
-; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[MATH]], [[MUL]]
-; CHECK-NEXT:    ret i1 [[OV]]
+; CHECK-NEXT:    [[NEG:%.*]] = add i32 [[T1:%.*]], -1
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[NEG]], [[MUL]]
+; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp eq i32 [[T1]], 0
+; CHECK-NEXT:    ret i1 [[TOBOOL]]
 ;
 entry:
   %rem = srem i32 %x, 2
20 changes: 10 additions & 10 deletions llvm/test/Transforms/CodeGenPrepare/X86/overflow-intrinsics.ll
@@ -47,15 +47,16 @@ define i64 @uaddo3(i64 %a, i64 %b) nounwind ssp {
   ret i64 %Q
 }
 
+; TODO? CGP sinks the compare before we have a chance to form the overflow intrinsic.
+
 define i64 @uaddo4(i64 %a, i64 %b, i1 %c) nounwind ssp {
 ; CHECK-LABEL: @uaddo4(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ADD:%.*]] = add i64 [[B:%.*]], [[A:%.*]]
 ; CHECK-NEXT:    br i1 [[C:%.*]], label [[NEXT:%.*]], label [[EXIT:%.*]]
 ; CHECK:       next:
-; CHECK-NEXT:    [[TMP0:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[B:%.*]], i64 [[A:%.*]])
-; CHECK-NEXT:    [[MATH:%.*]] = extractvalue { i64, i1 } [[TMP0]], 0
-; CHECK-NEXT:    [[OV:%.*]] = extractvalue { i64, i1 } [[TMP0]], 1
-; CHECK-NEXT:    [[Q:%.*]] = select i1 [[OV]], i64 [[B]], i64 42
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp ugt i64 [[B]], [[ADD]]
+; CHECK-NEXT:    [[Q:%.*]] = select i1 [[TMP0]], i64 [[B]], i64 42
 ; CHECK-NEXT:    ret i64 [[Q]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret i64 0
@@ -362,7 +363,7 @@ define i1 @usubo_ne_constant0_op1_i32(i32 %x, i32* %p) {
   ret i1 %ov
 }
 
-; Verify insertion point for multi-BB.
+; This used to verify insertion point for multi-BB, but now we just bail out.
 
 declare void @call(i1)
 
@@ -371,15 +372,14 @@ define i1 @usubo_ult_sub_dominates_i64(i64 %x, i64 %y, i64* %p, i1 %cond) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 [[COND:%.*]], label [[T:%.*]], label [[F:%.*]]
 ; CHECK:       t:
-; CHECK-NEXT:    [[TMP0:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[X:%.*]], i64 [[Y:%.*]])
-; CHECK-NEXT:    [[MATH:%.*]] = extractvalue { i64, i1 } [[TMP0]], 0
-; CHECK-NEXT:    [[OV1:%.*]] = extractvalue { i64, i1 } [[TMP0]], 1
-; CHECK-NEXT:    store i64 [[MATH]], i64* [[P:%.*]]
+; CHECK-NEXT:    [[S:%.*]] = sub i64 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    store i64 [[S]], i64* [[P:%.*]]
 ; CHECK-NEXT:    br i1 [[COND]], label [[END:%.*]], label [[F]]
 ; CHECK:       f:
 ; CHECK-NEXT:    ret i1 [[COND]]
 ; CHECK:       end:
-; CHECK-NEXT:    ret i1 [[OV1]]
+; CHECK-NEXT:    [[OV:%.*]] = icmp ult i64 [[X]], [[Y]]
+; CHECK-NEXT:    ret i1 [[OV]]
 ;
 entry:
   br i1 %cond, label %t, label %f
