Skip to content

Commit

Permalink
[x86] Teach the backend to fold more read-modify-write memory operands
Browse files Browse the repository at this point in the history
to instructions.

These can't be reasonably matched in tablegen due to the handling of
flags, so we have to do this in C++ code. We only did it for `inc` and
`dec` historically, this starts fleshing that out to more interesting
instructions. Notably, this handles transfering operands to `add` and
`sub`.

Currently this forces them into a register. The next patch will add
support for keeping immediate operands as immediates. Then I'll extend
this beyond just `add` and `sub`.

I'm not super thrilled by the repeated switches in the code but
everything else I tried was really ugly or problematic.

Many thanks to Craig Topper for the suggestions about where to even
begin here and how to make this stuff work.

Differential Revision: https://reviews.llvm.org/D37130

llvm-svn: 311806
  • Loading branch information
chandlerc committed Aug 25, 2017
1 parent 2605381 commit 4b611a8
Show file tree
Hide file tree
Showing 6 changed files with 507 additions and 74 deletions.
126 changes: 73 additions & 53 deletions llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
Expand Up @@ -1932,42 +1932,6 @@ static bool hasNoSignedComparisonUses(SDNode *N) {
return true;
}

/// Get the appropriate X86 opcode for an in-memory arithmetic operation that
/// also sets flags.
///
/// FIXME: This is essentially re-implemneting a subset of the patterns for
/// these instructions. Instead, we should compute this from the patterns
/// somehow.
///
/// FIXME: Currently we only support integer operations.
///
/// If there is no X86 opcode, returns none.
static Optional<unsigned> getFusedLdStWithFlagsOpcode(EVT LdVT, unsigned Opc) {
auto SelectSize = [&](unsigned Opc64, unsigned Opc32, unsigned Opc16,
unsigned Opc8) -> Optional<unsigned> {
switch (LdVT.getSimpleVT().SimpleTy) {
case MVT::i64:
return Opc64;
case MVT::i32:
return Opc32;
case MVT::i16:
return Opc16;
case MVT::i8:
return Opc8;
default:
return None;
}
};
switch (Opc) {
default:
return None;
case X86ISD::DEC:
return SelectSize(X86::DEC64m, X86::DEC32m, X86::DEC16m, X86::DEC8m);
case X86ISD::INC:
return SelectSize(X86::INC64m, X86::INC32m, X86::INC16m, X86::INC8m);
}
}

/// Check whether or not the chain ending in StoreNode is suitable for doing
/// the {load; op; store} to modify transformation.
static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode,
Expand Down Expand Up @@ -2047,15 +2011,16 @@ static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode,
return true;
}

// Change a chain of {load; incr or dec; store} of the same value into
// a simple increment or decrement through memory of that value, if the
// uses of the modified value and its address are suitable.
// The DEC64m tablegen pattern is currently not able to match the case where
// the EFLAGS on the original DEC are used. (This also applies to
// {INC,DEC}X{64,32,16,8}.)
// We'll need to improve tablegen to allow flags to be transferred from a
// node in the pattern to the result node. probably with a new keyword
// for example, we have this
// Change a chain of {load; op; store} of the same value into a simple op
// through memory of that value, if the uses of the modified value and its
// address are suitable.
//
// The tablegen pattern memory operand pattern is currently not able to match
// the case where the EFLAGS on the original operation are used.
//
// To move this to tablegen, we'll need to improve tablegen to allow flags to
// be transferred from a node in the pattern to the result node, probably with
// a new keyword. For example, we have this
// def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
// [(store (add (loadi64 addr:$dst), -1), addr:$dst),
// (implicit EFLAGS)]>;
Expand All @@ -2064,19 +2029,29 @@ static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode,
// [(store (add (loadi64 addr:$dst), -1), addr:$dst),
// (transferrable EFLAGS)]>;
//
// FIXME: This should handle a wide range of operations which support RMW
// memory operands, not just inc and dec.
// Until then, we manually fold these and instruction select the operation
// here.
bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
StoreSDNode *StoreNode = cast<StoreSDNode>(Node);
SDValue StoredVal = StoreNode->getOperand(1);
unsigned Opc = StoredVal->getOpcode();

// Before we try to select anything, make sure this is memory operand size
// and opcode we can handle. Note that this must match the code below that
// actually lowers the opcodes.
EVT MemVT = StoreNode->getMemoryVT();
if (!MemVT.isSimple())
if (MemVT != MVT::i64 && MemVT != MVT::i32 && MemVT != MVT::i16 &&
MemVT != MVT::i8)
return false;
Optional<unsigned> NewOpc = getFusedLdStWithFlagsOpcode(MemVT, Opc);
if (!NewOpc)
switch (Opc) {
default:
return false;
case X86ISD::INC:
case X86ISD::DEC:
case X86ISD::ADD:
case X86ISD::SUB:
break;
}

LoadSDNode *LoadNode = nullptr;
SDValue InputChain;
Expand All @@ -2089,12 +2064,57 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
Segment))
return false;

auto SelectOpcodeForSize = [&](unsigned Opc64, unsigned Opc32, unsigned Opc16,
unsigned Opc8) {
switch (MemVT.getSimpleVT().SimpleTy) {
case MVT::i64:
return Opc64;
case MVT::i32:
return Opc32;
case MVT::i16:
return Opc16;
case MVT::i8:
return Opc8;
default:
llvm_unreachable("Invalid size!");
}
};

MachineSDNode *Result;
switch (Opc) {
case X86ISD::INC:
case X86ISD::DEC: {
unsigned NewOpc = Opc == X86ISD::INC
? SelectOpcodeForSize(X86::INC64m, X86::INC32m,
X86::INC16m, X86::INC8m)
: SelectOpcodeForSize(X86::DEC64m, X86::DEC32m,
X86::DEC16m, X86::DEC8m);
const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
Result =
CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other, Ops);
break;
}
case X86ISD::ADD:
case X86ISD::SUB: {
unsigned NewOpc = Opc == X86ISD::ADD
? SelectOpcodeForSize(X86::ADD64mr, X86::ADD32mr,
X86::ADD16mr, X86::ADD8mr)
: SelectOpcodeForSize(X86::SUB64mr, X86::SUB32mr,
X86::SUB16mr, X86::SUB8mr);
const SDValue Ops[] = {Base, Scale, Index,
Disp, Segment, StoredVal->getOperand(1),
InputChain};
Result =
CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other, Ops);
break;
}
default:
llvm_unreachable("Invalid opcode!");
}

MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(2);
MemOp[0] = StoreNode->getMemOperand();
MemOp[1] = LoadNode->getMemOperand();
const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
MachineSDNode *Result =
CurDAG->getMachineNode(*NewOpc, SDLoc(Node), MVT::i32, MVT::Other, Ops);
Result->setMemRefs(MemOp, MemOp + 2);

ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1));
Expand Down
6 changes: 2 additions & 4 deletions llvm/test/CodeGen/X86/add.ll
Expand Up @@ -341,9 +341,8 @@ define void @test12(i64* inreg %a) nounwind {
; X32-LABEL: test12:
; X32: # BB#0: # %entry
; X32-NEXT: movl $-2147483648, %ecx # imm = 0x80000000
; X32-NEXT: addl (%eax), %ecx
; X32-NEXT: addl %ecx, (%eax)
; X32-NEXT: adcl $0, 4(%eax)
; X32-NEXT: movl %ecx, (%eax)
; X32-NEXT: retl
;
; X64-LINUX-LABEL: test12:
Expand All @@ -366,9 +365,8 @@ define void @test13(i64* inreg %a) nounwind {
; X32-LABEL: test13:
; X32: # BB#0: # %entry
; X32-NEXT: movl $128, %ecx
; X32-NEXT: addl (%eax), %ecx
; X32-NEXT: addl %ecx, (%eax)
; X32-NEXT: adcl $0, 4(%eax)
; X32-NEXT: movl %ecx, (%eax)
; X32-NEXT: retl
;
; X64-LINUX-LABEL: test13:
Expand Down
3 changes: 1 addition & 2 deletions llvm/test/CodeGen/X86/addcarry.ll
Expand Up @@ -171,8 +171,7 @@ define void @muladd(%accumulator* nocapture %this, i64 %arg.a, i64 %arg.b) {
; CHECK: # BB#0: # %entry
; CHECK-NEXT: movq %rdx, %rax
; CHECK-NEXT: mulq %rsi
; CHECK-NEXT: addq (%rdi), %rax
; CHECK-NEXT: movq %rax, (%rdi)
; CHECK-NEXT: addq %rax, (%rdi)
; CHECK-NEXT: adcq 8(%rdi), %rdx
; CHECK-NEXT: movq %rdx, 8(%rdi)
; CHECK-NEXT: adcl $0, 16(%rdi)
Expand Down

0 comments on commit 4b611a8

Please sign in to comment.