Skip to content

Commit

Permalink
Correct load-op-store cycle detection analysis
Browse files Browse the repository at this point in the history
Add missing cycle dependency checks in load-op-store fusion.

Fixes PR36274.

Reviewers: craig.topper, bogner

Subscribers: hiraditya, llvm-commits

Differential Revision: https://reviews.llvm.org/D43154

llvm-svn: 327172
  • Loading branch information
niravhdave committed Mar 9, 2018
1 parent d668f69 commit 0fab417
Show file tree
Hide file tree
Showing 2 changed files with 95 additions and 39 deletions.
101 changes: 62 additions & 39 deletions llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
Expand Up @@ -2109,27 +2109,56 @@ static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode,
LoadNode->getOffset() != StoreNode->getOffset())
return false;

// Check if the chain is produced by the load or is a TokenFactor with
// the load output chain as an operand. Return InputChain by reference.
bool FoundLoad = false;
SmallVector<SDValue, 4> ChainOps;
SmallVector<const SDNode *, 4> LoopWorklist;
SmallPtrSet<const SDNode *, 16> Visited;
const unsigned int Max = 1024;

// Visualization of Load-Op-Store fusion:
// -------------------------
// Legend:
// *-lines = Chain operand dependencies.
// |-lines = Normal operand dependencies.
// Dependencies flow down and right. n-suffix references multiple nodes.
//
// C Xn C
// * * *
// * * *
// Xn A-LD Yn TF Yn
// * * \ | * |
// * * \ | * |
// * * \ | => A--LD_OP_ST
// * * \| \
// TF OP \
// * | \ Zn
// * | \
// A-ST Zn
//

// This merge induced dependences from: #1: Xn -> LD, OP, Zn
// #2: Yn -> LD
// #3: ST -> Zn

// Ensure the transform is safe by checking for the dual
// dependencies to make sure we do not induce a loop.

// As LD is a predecessor to both OP and ST we can do this by checking:
// a). if LD is a predecessor to a member of Xn or Yn.
// b). if a Zn is a predecessor to ST.

// However, (b) can only occur through being a chain predecessor to
// ST, which is the same as Zn being a member or predecessor of Xn,
// which is a subset of LD being a predecessor of Xn. So it's
// subsumed by check (a).

SDValue Chain = StoreNode->getChain();

// Gather X elements in ChainOps.
if (Chain == Load.getValue(1)) {
InputChain = LoadNode->getChain();
return true;
}

if (Chain.getOpcode() == ISD::TokenFactor) {
// Fusing Load-Op-Store requires predecessors of store must also
// be predecessors of the load. This addition may cause a loop. We
// can check this by doing a search for Load in the new
// dependencies. As this can be expensive, heuristically prune
// this search by visiting the uses and make sure they all have
// smaller node id than the load.

bool FoundLoad = false;
SmallVector<SDValue, 4> ChainOps;
SmallVector<const SDNode *, 4> LoopWorklist;
SmallPtrSet<const SDNode *, 16> Visited;
FoundLoad = true;
ChainOps.push_back(Load.getOperand(0));
} else if (Chain.getOpcode() == ISD::TokenFactor) {
for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) {
SDValue Op = Chain.getOperand(i);
if (Op == Load.getValue(1)) {
Expand All @@ -2141,31 +2170,25 @@ static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode,
LoopWorklist.push_back(Op.getNode());
ChainOps.push_back(Op);
}
}

if (!FoundLoad)
return false;
if (!FoundLoad)
return false;

// If Loop Worklist is not empty. Check if we would make a loop.
if (!LoopWorklist.empty()) {
const unsigned int Max = 8192;
// if Load is predecessor to potentially loop inducing chain
// dependencies.
if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist,
Max, true))
return false;
// Fail conservatively if we ended loop search early.
if (Visited.size() >= Max)
return false;
}
// Worklist is currently Xn. Add Yn to worklist.
for (SDValue Op : StoredVal->ops())
if (Op.getNode() != LoadNode)
LoopWorklist.push_back(Op.getNode());

// Make a new TokenFactor with all the other input chains except
// for the load.
InputChain =
CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps);
return true;
// Check (a) if Load is a predecessor to Xn + Yn
if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist, Max,
true))
return false;

InputChain =
CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps);
return true;
}
return false;
}

// Change a chain of {load; op; store} of the same value into a simple op
// through memory of that value, if the uses of the modified value and its
Expand Down
33 changes: 33 additions & 0 deletions llvm/test/CodeGen/X86/pr36274.ll
@@ -0,0 +1,33 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i386-unknown-linux-gnu | FileCheck %s

; This tests is checking for a case where the x86 load-op-store fusion
; misses a dependence between the fused load and a non-fused operand
; to the load causing a cycle. Here the dependence in question comes
; from the carry in input of the adcl.

@vx = external local_unnamed_addr global <2 x i32>, align 8

define void @pr36274(i32* %somewhere) {
; CHECK-LABEL: pr36274:
; CHECK: # %bb.0:
; CHECK-NEXT: movl vx+4, %eax
; CHECK-NEXT: addl $1, vx
; CHECK-NEXT: adcl $0, %eax
; CHECK-NEXT: movl %eax, vx+4
; CHECK-NEXT: retl
%a0 = getelementptr <2 x i32>, <2 x i32>* @vx, i32 0, i32 0
%a1 = getelementptr <2 x i32>, <2 x i32>* @vx, i32 0, i32 1
%x1 = load volatile i32, i32* %a1, align 4
%x0 = load volatile i32, i32* %a0, align 8
%vx0 = insertelement <2 x i32> undef, i32 %x0, i32 0
%vx1 = insertelement <2 x i32> %vx0, i32 %x1, i32 1
%x = bitcast <2 x i32> %vx1 to i64
%add = add i64 %x, 1
%vadd = bitcast i64 %add to <2 x i32>
%vx1_0 = extractelement <2 x i32> %vadd, i32 0
%vx1_1 = extractelement <2 x i32> %vadd, i32 1
store i32 %vx1_0, i32* %a0, align 8
store i32 %vx1_1, i32* %a1, align 4
ret void
}

0 comments on commit 0fab417

Please sign in to comment.