Lower idempotent RMWs to fence+load
Summary:
I originally tried doing this specifically for X86 in the backend in D5091,
but it was rather brittle and ran too late to be done in a general way.
Furthermore, other targets may want to implement similar optimizations.
So I reimplemented it at the IR level, fitting it into AtomicExpandPass,
since the transformation interacts with that pass (and could not be done
cleanly at the backend level).

This optimization relies on a new target hook, which is only used by X86
for now, as the correctness of the optimization on other targets remains
an open question. If it is found correct on other targets, it should be
trivial to enable for them.

Details of the optimization are discussed in D5091.

Test Plan: make check-all + a new test

Reviewers: jfb

Subscribers: llvm-commits

Differential Revision: http://reviews.llvm.org/D5422

llvm-svn: 218455
Robin Morisset committed Sep 25, 2014
1 parent 8cb2cae commit 810739d
Showing 5 changed files with 185 additions and 6 deletions.
14 changes: 14 additions & 0 deletions llvm/include/llvm/Target/TargetLowering.h
@@ -1008,6 +1008,20 @@ class TargetLoweringBase {
return false;
}

/// On some platforms, an AtomicRMW that never actually modifies the value
/// (such as fetch_add of 0) can be turned into a fence followed by an
/// atomic load. This may sound useless, but it makes it possible for the
/// processor to keep the cache line shared, dramatically improving
/// performance. Such idempotent RMWs are also useful for implementing some
/// kinds of locks; see for example (justification + benchmarks):
/// http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf
/// This method tries to do that transformation, returning the atomic load if
/// it succeeds, and nullptr otherwise.
/// If shouldExpandAtomicLoadInIR returns true on that load, it will undergo
/// another round of expansion.
virtual LoadInst *lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *RMWI) const {
return nullptr;
}
//===--------------------------------------------------------------------===//
// TargetLowering Configuration Methods - These methods should be invoked by
// the derived class constructor to configure this object for the target.
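
The commit message notes that once the optimization is known to be correct on another target, enabling it there should be trivial. As a minimal sketch (not part of this commit), a hypothetical target "Foo" whose sequentially consistent fence is strong enough could override the new hook roughly as follows; FooTargetLowering and the choice of fence are assumptions for illustration only:

LoadInst *FooTargetLowering::lowerIdempotentRMWIntoFencedLoad(
    AtomicRMWInst *AI) const {
  IRBuilder<> Builder(AI);
  // Restrict the ordering so we never produce a Release or AcquireRelease load.
  AtomicOrdering Order =
      AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
  // Hypothetical choice: emit a full fence before the load (see the X86
  // implementation below for why some fence is needed).
  Builder.CreateFence(SequentiallyConsistent, AI->getSynchScope());
  LoadInst *Loaded = Builder.CreateLoad(AI->getPointerOperand());
  Loaded->setAtomic(Order, AI->getSynchScope());
  AI->replaceAllUsesWith(Loaded);
  AI->eraseFromParent();
  return Loaded;
}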
45 changes: 43 additions & 2 deletions llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -51,6 +51,8 @@ namespace {
bool expandAtomicRMWToLLSC(AtomicRMWInst *AI);
bool expandAtomicRMWToCmpXchg(AtomicRMWInst *AI);
bool expandAtomicCmpXchg(AtomicCmpXchgInst *CI);
bool isIdempotentRMW(AtomicRMWInst *AI);
bool simplifyIdempotentRMW(AtomicRMWInst *AI);
};
}

@@ -127,8 +129,15 @@ bool AtomicExpand::runOnFunction(Function &F) {
MadeChange |= expandAtomicLoad(LI);
} else if (SI && TargetLowering->shouldExpandAtomicStoreInIR(SI)) {
MadeChange |= expandAtomicStore(SI);
} else if (RMWI && TargetLowering->shouldExpandAtomicRMWInIR(RMWI)) {
MadeChange |= expandAtomicRMW(RMWI);
} else if (RMWI) {
// There are two different ways of expanding RMW instructions:
// - into a load if it is idempotent
// - into a Cmpxchg/LL-SC loop otherwise
// we try them in that order.
MadeChange |= (isIdempotentRMW(RMWI) &&
simplifyIdempotentRMW(RMWI)) ||
(TargetLowering->shouldExpandAtomicRMWInIR(RMWI) &&
expandAtomicRMW(RMWI));
} else if (CASI && TargetLowering->hasLoadLinkedStoreConditional()) {
MadeChange |= expandAtomicCmpXchg(CASI);
}
@@ -520,3 +529,35 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
CI->eraseFromParent();
return true;
}

bool AtomicExpand::isIdempotentRMW(AtomicRMWInst* RMWI) {
auto C = dyn_cast<ConstantInt>(RMWI->getValOperand());
if(!C)
return false;

AtomicRMWInst::BinOp Op = RMWI->getOperation();
switch(Op) {
case AtomicRMWInst::Add:
case AtomicRMWInst::Sub:
case AtomicRMWInst::Or:
case AtomicRMWInst::Xor:
return C->isZero();
case AtomicRMWInst::And:
return C->isMinusOne();
// FIXME: we could also handle Min/Max/UMin/UMax by checking for INT_MIN/INT_MAX/...
default:
return false;
}
}

bool AtomicExpand::simplifyIdempotentRMW(AtomicRMWInst* RMWI) {
auto TLI = TM->getSubtargetImpl()->getTargetLowering();

if (auto ResultingLoad = TLI->lowerIdempotentRMWIntoFencedLoad(RMWI)) {
if (TLI->shouldExpandAtomicLoadInIR(ResultingLoad))
expandAtomicLoad(ResultingLoad);
return true;
}

return false;
}
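
As a minimal sketch (not part of this commit), the FIXME in isIdempotentRMW could be addressed by also recognizing the identity element of the min/max operations, e.g. with extra cases in the same switch; this only illustrates the FIXME, it is not code from the patch:

// max(x, INT_MIN), min(x, INT_MAX), umax(x, 0) and umin(x, UINT_MAX)
// all leave x unchanged, so these operands make the RMW idempotent too.
case AtomicRMWInst::Max:
  return C->isMinValue(/*isSigned=*/true);
case AtomicRMWInst::Min:
  return C->isMaxValue(/*isSigned=*/true);
case AtomicRMWInst::UMax:
  return C->isMinValue(/*isSigned=*/false); // i.e. zero
case AtomicRMWInst::UMin:
  return C->isMaxValue(/*isSigned=*/false); // i.e. all-ones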
73 changes: 69 additions & 4 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -17802,6 +17802,74 @@ bool X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
}
}

static bool hasMFENCE(const X86Subtarget& Subtarget) {
// Use mfence if we have SSE2 or we're on x86-64 (even if we asked for
// no-sse2). There isn't any reason to disable it if the target processor
// supports it.
return Subtarget.hasSSE2() || Subtarget.is64Bit();
}

LoadInst *
X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
const X86Subtarget &Subtarget =
getTargetMachine().getSubtarget<X86Subtarget>();
unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
const Type *MemType = AI->getType();
// Accesses larger than the native width are turned into cmpxchg/libcalls, so
// there is no benefit in turning such RMWs into loads, and it is actually
// harmful as it introduces a mfence.
if (MemType->getPrimitiveSizeInBits() > NativeWidth)
return nullptr;

auto Builder = IRBuilder<>(AI);
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
auto SynchScope = AI->getSynchScope();
// We must restrict the ordering to avoid generating loads with Release or
// ReleaseAcquire orderings.
auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
auto Ptr = AI->getPointerOperand();

// Before the load we need a fence. Here is an example lifted from
// http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
// is required:
// Thread 0:
// x.store(1, relaxed);
// r1 = y.fetch_add(0, release);
// Thread 1:
// y.fetch_add(42, acquire);
// r2 = x.load(relaxed);
// r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
// lowered to just a load without a fence. An mfence flushes the store buffer,
// making the optimization clearly correct.
// FIXME: a fence is clearly required if isAtLeastRelease(Order), but it is
// not obvious whether one is needed otherwise; we might be able to be more
// aggressive on relaxed idempotent rmw. In practice, they do not look
// useful, so we don't try to be especially clever.
if (SynchScope == SingleThread) {
// FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
// the IR level, so we must wrap it in an intrinsic.
return nullptr;
} else if (hasMFENCE(Subtarget)) {
Function *MFence = llvm::Intrinsic::getDeclaration(M,
Intrinsic::x86_sse2_mfence);
Builder.CreateCall(MFence);
} else {
// FIXME: it might make sense to use a locked operation here but on a
// different cache-line to prevent cache-line bouncing. In practice it
// is probably a small win, and x86 processors without mfence are rare
// enough that we do not bother.
return nullptr;
}

// Finally we can emit the atomic load.
LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr,
AI->getType()->getPrimitiveSizeInBits());
Loaded->setAtomic(Order, SynchScope);
AI->replaceAllUsesWith(Loaded);
AI->eraseFromParent();
return Loaded;
}

static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
SDLoc dl(Op);
@@ -17813,10 +17881,7 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget,
// The only fence that needs an instruction is a sequentially-consistent
// cross-thread fence.
if (FenceOrdering == SequentiallyConsistent && FenceScope == CrossThread) {
// Use mfence if we have SSE2 or we're on x86-64 (even if we asked for
// no-sse2). There isn't any reason to disable it if the target processor
// supports it.
if (Subtarget->hasSSE2() || Subtarget->is64Bit())
if (hasMFENCE(*Subtarget))
return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));

SDValue Chain = Op.getOperand(0);
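
For context, a minimal C++11 illustration (not part of this commit) of the kind of source code that produces an idempotent RMW; the names here are hypothetical:

#include <atomic>

std::atomic<int> Flag{0};

int readFlag() {
  // fetch_add of 0 never changes the stored value, but it is still an
  // atomic read-modify-write, which is the property some lock schemes
  // rely on (see the HP report cited above). With this patch, x86 lowers
  // it to mfence + mov instead of lock xadd, so the cache line can stay
  // shared between readers.
  return Flag.fetch_add(0, std::memory_order_acquire);
}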
3 changes: 3 additions & 0 deletions llvm/lib/Target/X86/X86ISelLowering.h
@@ -965,6 +965,9 @@ namespace llvm {
bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
bool shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;

LoadInst *
lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override;

bool needsCmpXchgNb(const Type *MemType) const;

/// Utility function to emit atomic-load-arith operations (and, or, xor,
56 changes: 56 additions & 0 deletions llvm/test/CodeGen/X86/atomic_idempotent.ll
@@ -0,0 +1,56 @@
; RUN: llc < %s -march=x86-64 -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=X64
; RUN: llc < %s -march=x86 -mattr=+sse2 -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=X32

; On x86, an atomic rmw operation that does not modify the value in memory
; (such as atomic add 0) can be replaced by an mfence followed by a mov.
; This is explained (with the motivation for such an optimization) in
; http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf

define i8 @add8(i8* %p) {
; CHECK-LABEL: add8
; CHECK: mfence
; CHECK: movb
%1 = atomicrmw add i8* %p, i8 0 monotonic
ret i8 %1
}

define i16 @or16(i16* %p) {
; CHECK-LABEL: or16
; CHECK: mfence
; CHECK: movw
%1 = atomicrmw or i16* %p, i16 0 acquire
ret i16 %1
}

define i32 @xor32(i32* %p) {
; CHECK-LABEL: xor32
; CHECK: mfence
; CHECK: movl
%1 = atomicrmw xor i32* %p, i32 0 release
ret i32 %1
}

define i64 @sub64(i64* %p) {
; CHECK-LABEL: sub64
; X64: mfence
; X64: movq
; X32-NOT: mfence
%1 = atomicrmw sub i64* %p, i64 0 seq_cst
ret i64 %1
}

define i128 @or128(i128* %p) {
; CHECK-LABEL: or128
; CHECK-NOT: mfence
%1 = atomicrmw or i128* %p, i128 0 monotonic
ret i128 %1
}

; For 'and', the idempotent value is (-1)
define i32 @and32 (i32* %p) {
; CHECK-LABEL: and32
; CHECK: mfence
; CHECK: movl
%1 = atomicrmw and i32* %p, i32 -1 acq_rel
ret i32 %1
}
