[X86] Improve lowering of idempotent RMW operations
The current lowering uses an mfence. mfences have substantially higher latency than the locked operations originally requested, but we still want to avoid contention on the original cache line. As such, use a locked instruction on a cache line assumed to be thread local.

Differential Revision: https://reviews.llvm.org/D58632

llvm-svn: 360393
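
For context, a minimal C++ sketch (not part of this commit; the names `flag`, `payload`, and `publish` are illustrative only) of the kind of source pattern that produces an idempotent, result-unused atomicrmw: a fetch_or of zero used purely for its ordering effect. With this change, the seq_cst form is expected to lower to a locked `or` against the stack instead of an mfence plus reload.

```cpp
// Illustrative only: or-ing 0 into an atomic leaves its value unchanged, so
// the only observable effect is the seq_cst ordering of the RMW itself. The
// returned old value is discarded, giving the "idempotent, no uses" shape.
#include <atomic>

std::atomic<int> flag{0};  // hypothetical shared flag
int payload;               // hypothetical non-atomic data

void publish() {
  payload = 42;                                 // plain store
  flag.fetch_or(0, std::memory_order_seq_cst);  // idempotent RMW, used as a full barrier
}
```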
preames authored and MrSidims committed May 24, 2019
1 parent f81b7ac commit a7e6d75
Showing 2 changed files with 111 additions and 66 deletions.
89 changes: 88 additions & 1 deletion llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -25727,6 +25727,14 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
if (MemType->getPrimitiveSizeInBits() > NativeWidth)
return nullptr;

// If this is a canonical idempotent atomicrmw w/no uses, we have a better
// lowering available in lowerAtomicArith.
// TODO: push more cases through this path.
if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
AI->use_empty())
return nullptr;

auto Builder = IRBuilder<>(AI);
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
auto SSID = AI->getSyncScopeID();
@@ -26223,6 +26231,59 @@ static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
}

/// Emit a locked operation on a stack location which does not change any
/// memory location, but does involve a lock prefix. Location is chosen to be
/// a) very likely accessed only by a single thread to minimize cache traffic,
/// and b) definitely dereferenceable. Returns the new Chain result.
static SDValue emitLockedStackOp(SelectionDAG &DAG,
const X86Subtarget &Subtarget,
SDValue Chain, SDLoc DL) {
// Implementation notes:
// 1) LOCK prefix creates a full read/write reordering barrier for memory
// operations issued by the current processor. As such, the location
// referenced is not relevant for the ordering properties of the instruction.
// See: Intel® 64 and IA-32 Architectures Software Developer’s Manual,
// 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions
// 2) Using an immediate operand appears to be the best encoding choice
// here since it doesn't require an extra register.
// 3) OR appears to be very slightly faster than ADD. (Though, the difference
// is small enough it might just be measurement noise.)
// 4) For the moment, we are using top of stack. This creates false sharing
// with actual stack access/call sequences, and it would be better to use a
// location within the redzone. Even so, this is still better than an
// mfence. TODO: Revise the offset used when we can assume a redzone.
//
// For a general discussion of the tradeoffs and benchmark results, see:
// https://shipilev.net/blog/2014/on-the-fence-with-dependencies/

if (Subtarget.is64Bit()) {
SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i8);
SDValue Ops[] = {
DAG.getRegister(X86::RSP, MVT::i64), // Base
DAG.getTargetConstant(1, DL, MVT::i8), // Scale
DAG.getRegister(0, MVT::i64), // Index
DAG.getTargetConstant(0, DL, MVT::i32), // Disp
DAG.getRegister(0, MVT::i32), // Segment.
Zero,
Chain};
SDNode *Res = DAG.getMachineNode(X86::LOCK_OR32mi8, DL, MVT::Other, Ops);
return SDValue(Res, 0);
}

SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
SDValue Ops[] = {
DAG.getRegister(X86::ESP, MVT::i32), // Base
DAG.getTargetConstant(1, DL, MVT::i8), // Scale
DAG.getRegister(0, MVT::i32), // Index
DAG.getTargetConstant(0, DL, MVT::i32), // Disp
DAG.getRegister(0, MVT::i32), // Segment.
Zero,
Chain
};
SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::Other, Ops);
return SDValue(Res, 0);
}

static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
unsigned NewOpc = 0;
@@ -26257,6 +26318,7 @@ static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
/// Lower atomic_load_ops into LOCK-prefixed operations.
static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
SDValue Chain = N->getOperand(0);
SDValue LHS = N->getOperand(1);
SDValue RHS = N->getOperand(2);
@@ -26271,7 +26333,6 @@ static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
// Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
// select LXADD if LOCK_SUB can't be selected.
if (Opc == ISD::ATOMIC_LOAD_SUB) {
AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
RHS, AN->getMemOperand());
@@ -26281,6 +26342,32 @@ static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
return N;
}

// Specialized lowering for the canonical form of an idempotent atomicrmw.
// The core idea here is that since the memory location isn't actually
// changing, all we need is a lowering for the *ordering* impacts of the
// atomicrmw. As such, we can choose a different operation and memory
// location to minimize impact on other code.
if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS)) {
// On X86, the only ordering which actually requires an instruction is
// seq_cst that isn't SingleThread; everything else just needs to be
// preserved during codegen and then dropped. Note that we expect (but don't
// assume) that orderings other than seq_cst and acq_rel have been
// canonicalized to a store or load.
if (AN->getOrdering() == AtomicOrdering::SequentiallyConsistent &&
AN->getSyncScopeID() == SyncScope::System) {
// Prefer a locked operation against a stack location to minimize cache
// traffic. This assumes that stack locations are very likely to be
// accessed only by the owning thread.
SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
DAG.ReplaceAllUsesOfValueWith(N.getValue(1), NewChain);
return SDValue();
}
// MEMBARRIER is a compiler barrier; it codegens to a no-op.
SDValue NewChain = DAG.getNode(X86ISD::MEMBARRIER, DL, MVT::Other, Chain);
DAG.ReplaceAllUsesOfValueWith(N.getValue(1), NewChain);
return SDValue();
}

SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
// RAUW the chain, but don't worry about the result, as it's unused.
assert(!N->hasAnyUseOfValue(0));
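
A rough C++ sketch (not LLVM API; the enum and function names below are hypothetical) of the dispatch the new lowerAtomicArith path performs for an idempotent, result-unused `or` of zero, restating the comments in the diff above:

```cpp
// Conceptual sketch only: the real code works on SDNodes. Only a seq_cst RMW
// at system scope needs an actual instruction on x86 (a locked op on a stack
// slot); all weaker cases collapse to a compiler-only barrier
// (X86ISD::MEMBARRIER) that emits no machine instruction.
enum class Ordering { Monotonic, Acquire, Release, AcqRel, SeqCst };

enum class IdempotentRMWLowering {
  LockedStackOp,   // e.g. "lock orl $0, (%rsp)" / "lock orl $0, (%esp)"
  CompilerBarrier  // codegen no-op, only constrains compiler reordering
};

IdempotentRMWLowering lowerIdempotentOrZero(Ordering Ord, bool SystemScope) {
  if (Ord == Ordering::SeqCst && SystemScope)
    return IdempotentRMWLowering::LockedStackOp;
  return IdempotentRMWLowering::CompilerBarrier;
}
```

The stack slot is chosen because its cache line is almost certainly private to the executing thread, so the locked operation provides the required full barrier without bouncing the user's cache line between cores; per the implementation notes, a red-zone offset would further reduce false sharing with ordinary stack traffic.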
88 changes: 23 additions & 65 deletions llvm/test/CodeGen/X86/atomic-idempotent.ll
@@ -166,86 +166,51 @@ define i32 @and32 (i32* %p) {
}

define void @or32_nouse_monotonic(i32* %p) {
; X64-LABEL: or32_nouse_monotonic:
; X64: # %bb.0:
; X64-NEXT: mfence
; X64-NEXT: movl (%rdi), %eax
; X64-NEXT: retq
;
; X32-LABEL: or32_nouse_monotonic:
; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: mfence
; X32-NEXT: movl (%eax), %eax
; X32-NEXT: retl
; CHECK-LABEL: or32_nouse_monotonic:
; CHECK: # %bb.0:
; CHECK-NEXT: #MEMBARRIER
; CHECK-NEXT: ret{{[l|q]}}
atomicrmw or i32* %p, i32 0 monotonic
ret void
}


define void @or32_nouse_acquire(i32* %p) {
; X64-LABEL: or32_nouse_acquire:
; X64: # %bb.0:
; X64-NEXT: mfence
; X64-NEXT: movl (%rdi), %eax
; X64-NEXT: retq
;
; X32-LABEL: or32_nouse_acquire:
; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: mfence
; X32-NEXT: movl (%eax), %eax
; X32-NEXT: retl
; CHECK-LABEL: or32_nouse_acquire:
; CHECK: # %bb.0:
; CHECK-NEXT: #MEMBARRIER
; CHECK-NEXT: ret{{[l|q]}}
atomicrmw or i32* %p, i32 0 acquire
ret void
}

define void @or32_nouse_release(i32* %p) {
; X64-LABEL: or32_nouse_release:
; X64: # %bb.0:
; X64-NEXT: mfence
; X64-NEXT: movl (%rdi), %eax
; X64-NEXT: retq
;
; X32-LABEL: or32_nouse_release:
; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: mfence
; X32-NEXT: movl (%eax), %eax
; X32-NEXT: retl
; CHECK-LABEL: or32_nouse_release:
; CHECK: # %bb.0:
; CHECK-NEXT: #MEMBARRIER
; CHECK-NEXT: ret{{[l|q]}}
atomicrmw or i32* %p, i32 0 release
ret void
}

define void @or32_nouse_acq_rel(i32* %p) {
; X64-LABEL: or32_nouse_acq_rel:
; X64: # %bb.0:
; X64-NEXT: mfence
; X64-NEXT: movl (%rdi), %eax
; X64-NEXT: retq
;
; X32-LABEL: or32_nouse_acq_rel:
; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: mfence
; X32-NEXT: movl (%eax), %eax
; X32-NEXT: retl
; CHECK-LABEL: or32_nouse_acq_rel:
; CHECK: # %bb.0:
; CHECK-NEXT: #MEMBARRIER
; CHECK-NEXT: ret{{[l|q]}}
atomicrmw or i32* %p, i32 0 acq_rel
ret void
}

define void @or32_nouse_seq_cst(i32* %p) {
; X64-LABEL: or32_nouse_seq_cst:
; X64: # %bb.0:
; X64-NEXT: mfence
; X64-NEXT: movl (%rdi), %eax
; X64-NEXT: lock orl $0, (%rsp)
; X64-NEXT: retq
;
; X32-LABEL: or32_nouse_seq_cst:
; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: mfence
; X32-NEXT: movl (%eax), %eax
; X32-NEXT: lock orl $0, (%esp)
; X32-NEXT: retl
atomicrmw or i32* %p, i32 0 seq_cst
ret void
@@ -255,8 +220,7 @@ define void @or32_nouse_seq_cst(i32* %p) {
define void @or64_nouse_seq_cst(i64* %p) {
; X64-LABEL: or64_nouse_seq_cst:
; X64: # %bb.0:
; X64-NEXT: mfence
; X64-NEXT: movq (%rdi), %rax
; X64-NEXT: lock orl $0, (%rsp)
; X64-NEXT: retq
;
; X32-LABEL: or64_nouse_seq_cst:
@@ -334,15 +298,12 @@ define void @or128_nouse_seq_cst(i128* %p) {
define void @or16_nouse_seq_cst(i16* %p) {
; X64-LABEL: or16_nouse_seq_cst:
; X64: # %bb.0:
; X64-NEXT: mfence
; X64-NEXT: movzwl (%rdi), %eax
; X64-NEXT: lock orl $0, (%rsp)
; X64-NEXT: retq
;
; X32-LABEL: or16_nouse_seq_cst:
; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: mfence
; X32-NEXT: movzwl (%eax), %eax
; X32-NEXT: lock orl $0, (%esp)
; X32-NEXT: retl
atomicrmw or i16* %p, i16 0 seq_cst
ret void
@@ -351,15 +312,12 @@ define void @or16_nouse_seq_cst(i16* %p) {
define void @or8_nouse_seq_cst(i8* %p) {
; X64-LABEL: or8_nouse_seq_cst:
; X64: # %bb.0:
; X64-NEXT: mfence
; X64-NEXT: movb (%rdi), %al
; X64-NEXT: lock orl $0, (%rsp)
; X64-NEXT: retq
;
; X32-LABEL: or8_nouse_seq_cst:
; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: mfence
; X32-NEXT: movb (%eax), %al
; X32-NEXT: lock orl $0, (%esp)
; X32-NEXT: retl
atomicrmw or i8* %p, i8 0 seq_cst
ret void