From 810739d17420d914b1a6f239f2ca7d5d82bd5f66 Mon Sep 17 00:00:00 2001
From: Robin Morisset
Date: Thu, 25 Sep 2014 17:27:43 +0000
Subject: [PATCH] Lower idempotent RMWs to fence+load

Summary:
I originally tried doing this specifically for X86 in the backend in D5091,
but it was rather brittle and ran too late in the pipeline to be general.
Furthermore, other targets may want to implement similar optimizations.
So I reimplemented it at the IR level, fitting it into AtomicExpandPass
since it interacts with that pass (which could not be done cleanly at the
backend level).

This optimization relies on a new target hook, which is only used by X86
for now, as the correctness of the optimization on other targets remains
an open question. If it is found correct on other targets, it should be
trivial to enable for them.

Details of the optimization are discussed in D5091.

Test Plan: make check-all + a new test

Reviewers: jfb

Subscribers: llvm-commits

Differential Revision: http://reviews.llvm.org/D5422

llvm-svn: 218455
---
 llvm/include/llvm/Target/TargetLowering.h  | 14 +++++
 llvm/lib/CodeGen/AtomicExpandPass.cpp      | 45 ++++++++++++-
 llvm/lib/Target/X86/X86ISelLowering.cpp    | 73 ++++++++++++++++++++--
 llvm/lib/Target/X86/X86ISelLowering.h      |  3 +
 llvm/test/CodeGen/X86/atomic_idempotent.ll | 56 +++++++++++++++++
 5 files changed, 185 insertions(+), 6 deletions(-)
 create mode 100644 llvm/test/CodeGen/X86/atomic_idempotent.ll

diff --git a/llvm/include/llvm/Target/TargetLowering.h b/llvm/include/llvm/Target/TargetLowering.h
index 97318d7f89cf0..ad5fc5d848bf3 100644
--- a/llvm/include/llvm/Target/TargetLowering.h
+++ b/llvm/include/llvm/Target/TargetLowering.h
@@ -1008,6 +1008,20 @@ class TargetLoweringBase {
     return false;
   }
 
+  /// On some platforms, an AtomicRMW that never actually modifies the value
+  /// (such as fetch_add of 0) can be turned into a fence followed by an
+  /// atomic load. This may sound useless, but it makes it possible for the
+  /// processor to keep the cacheline shared, dramatically improving
+  /// performance. And such idempotent RMWs are useful for implementing some
+  /// kinds of locks; see for example (justification + benchmarks):
+  /// http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf
+  /// This method tries doing that transformation, returning the atomic load if
+  /// it succeeds, and nullptr otherwise.
+  /// If shouldExpandAtomicLoadInIR returns true on that load, it will undergo
+  /// another round of expansion.
+  virtual LoadInst *lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *RMWI) const {
+    return nullptr;
+  }
   //===--------------------------------------------------------------------===//
   // TargetLowering Configuration Methods - These methods should be invoked by
   // the derived class constructor to configure this object for the target.
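For illustration, here is a rough sketch of what the new hook enables at the
IR level, assuming an x86 target where mfence is available and the RMW is not
singlethread-scoped. The function name @reader is made up for the example, and
the load's alignment and ordering are simplified relative to what the code in
this patch actually computes (it derives the ordering via
getStrongestFailureOrdering).

    ; before: an idempotent RMW (add of 0)
    define i32 @reader(i32* %p) {
      %old = atomicrmw add i32* %p, i32 0 acquire
      ret i32 %old
    }

    ; after: roughly what AtomicExpandPass plus the X86 hook produce
    declare void @llvm.x86.sse2.mfence()

    define i32 @reader(i32* %p) {
      call void @llvm.x86.sse2.mfence()
      %old = load atomic i32* %p acquire, align 4
      ret i32 %old
    }
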
diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index dd532294f44c6..12f6bd77d334d 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -51,6 +51,8 @@ namespace {
     bool expandAtomicRMWToLLSC(AtomicRMWInst *AI);
     bool expandAtomicRMWToCmpXchg(AtomicRMWInst *AI);
     bool expandAtomicCmpXchg(AtomicCmpXchgInst *CI);
+    bool isIdempotentRMW(AtomicRMWInst *AI);
+    bool simplifyIdempotentRMW(AtomicRMWInst *AI);
   };
 }
 
@@ -127,8 +129,15 @@ bool AtomicExpand::runOnFunction(Function &F) {
       MadeChange |= expandAtomicLoad(LI);
     } else if (SI && TargetLowering->shouldExpandAtomicStoreInIR(SI)) {
       MadeChange |= expandAtomicStore(SI);
-    } else if (RMWI && TargetLowering->shouldExpandAtomicRMWInIR(RMWI)) {
-      MadeChange |= expandAtomicRMW(RMWI);
+    } else if (RMWI) {
+      // There are two different ways of expanding RMW instructions:
+      // - into a load if it is idempotent
+      // - into a Cmpxchg/LL-SC loop otherwise
+      // We try them in that order.
+      MadeChange |= (isIdempotentRMW(RMWI) &&
+                     simplifyIdempotentRMW(RMWI)) ||
+                    (TargetLowering->shouldExpandAtomicRMWInIR(RMWI) &&
+                     expandAtomicRMW(RMWI));
     } else if (CASI && TargetLowering->hasLoadLinkedStoreConditional()) {
       MadeChange |= expandAtomicCmpXchg(CASI);
     }
@@ -520,3 +529,35 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
   CI->eraseFromParent();
   return true;
 }
+
+bool AtomicExpand::isIdempotentRMW(AtomicRMWInst *RMWI) {
+  auto C = dyn_cast<ConstantInt>(RMWI->getValOperand());
+  if (!C)
+    return false;
+
+  AtomicRMWInst::BinOp Op = RMWI->getOperation();
+  switch (Op) {
+    case AtomicRMWInst::Add:
+    case AtomicRMWInst::Sub:
+    case AtomicRMWInst::Or:
+    case AtomicRMWInst::Xor:
+      return C->isZero();
+    case AtomicRMWInst::And:
+      return C->isMinusOne();
+    // FIXME: we could also handle Min/Max/UMin/UMax with INT_MIN/INT_MAX/...
+    default:
+      return false;
+  }
+}
+
+bool AtomicExpand::simplifyIdempotentRMW(AtomicRMWInst *RMWI) {
+  auto TLI = TM->getSubtargetImpl()->getTargetLowering();
+
+  if (auto ResultingLoad = TLI->lowerIdempotentRMWIntoFencedLoad(RMWI)) {
+    if (TLI->shouldExpandAtomicLoadInIR(ResultingLoad))
+      expandAtomicLoad(ResultingLoad);
+    return true;
+  }
+
+  return false;
+}
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 44cd07b72d14d..091168336b08c 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -17802,6 +17802,74 @@ bool X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
   }
 }
 
+static bool hasMFENCE(const X86Subtarget& Subtarget) {
+  // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for
+  // no-sse2). There isn't any reason to disable it if the target processor
+  // supports it.
+  return Subtarget.hasSSE2() || Subtarget.is64Bit();
+}
+
+LoadInst *
+X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
+  const X86Subtarget &Subtarget =
+      getTargetMachine().getSubtarget<X86Subtarget>();
+  unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
+  const Type *MemType = AI->getType();
+  // Accesses larger than the native width are turned into cmpxchg/libcalls, so
+  // there is no benefit in turning such RMWs into loads, and it is actually
+  // harmful as it introduces an mfence.
+  if (MemType->getPrimitiveSizeInBits() > NativeWidth)
+    return nullptr;
+
+  auto Builder = IRBuilder<>(AI);
+  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
+  auto SynchScope = AI->getSynchScope();
+  // We must restrict the ordering to avoid generating loads with Release or
+  // ReleaseAcquire orderings.
+  auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
+  auto Ptr = AI->getPointerOperand();
+
+  // Before the load we need a fence. Here is an example lifted from
+  // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
+  // is required:
+  // Thread 0:
+  //   x.store(1, relaxed);
+  //   r1 = y.fetch_add(0, release);
+  // Thread 1:
+  //   y.fetch_add(42, acquire);
+  //   r2 = x.load(relaxed);
+  // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
+  // lowered to just a load without a fence. An mfence flushes the store
+  // buffer, making the optimization clearly correct.
+  // FIXME: it is required if isAtLeastRelease(Order), but it is not clear
+  // otherwise; we might be able to be more aggressive on relaxed idempotent
+  // rmw. In practice, they do not look useful, so we don't try to be
+  // especially clever.
+  if (SynchScope == SingleThread) {
+    // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
+    // the IR level, so we must wrap it in an intrinsic.
+    return nullptr;
+  } else if (hasMFENCE(Subtarget)) {
+    Function *MFence = llvm::Intrinsic::getDeclaration(M,
+        Intrinsic::x86_sse2_mfence);
+    Builder.CreateCall(MFence);
+  } else {
+    // FIXME: it might make sense to use a locked operation here but on a
+    // different cache-line to prevent cache-line bouncing. In practice it
+    // is probably a small win, and x86 processors without mfence are rare
+    // enough that we do not bother.
+    return nullptr;
+  }
+
+  // Finally we can emit the atomic load.
+  LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr,
+      AI->getType()->getPrimitiveSizeInBits());
+  Loaded->setAtomic(Order, SynchScope);
+  AI->replaceAllUsesWith(Loaded);
+  AI->eraseFromParent();
+  return Loaded;
+}
+
 static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget,
                                  SelectionDAG &DAG) {
   SDLoc dl(Op);
@@ -17813,10 +17881,7 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget,
   // The only fence that needs an instruction is a sequentially-consistent
   // cross-thread fence.
   if (FenceOrdering == SequentiallyConsistent && FenceScope == CrossThread) {
-    // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for
-    // no-sse2). There isn't any reason to disable it if the target processor
-    // supports it.
-    if (Subtarget->hasSSE2() || Subtarget->is64Bit())
+    if (hasMFENCE(*Subtarget))
       return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
 
     SDValue Chain = Op.getOperand(0);
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 0e6011cdbd9fb..2d4727bc48942 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -965,6 +965,9 @@ namespace llvm {
     bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
     bool shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;
 
+    LoadInst *
+    lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override;
+
     bool needsCmpXchgNb(const Type *MemType) const;
 
     /// Utility function to emit atomic-load-arith operations (and, or, xor,
diff --git a/llvm/test/CodeGen/X86/atomic_idempotent.ll b/llvm/test/CodeGen/X86/atomic_idempotent.ll
new file mode 100644
index 0000000000000..1afc535133d6d
--- /dev/null
+++ b/llvm/test/CodeGen/X86/atomic_idempotent.ll
@@ -0,0 +1,56 @@
+; RUN: llc < %s -march=x86-64 -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=X64
+; RUN: llc < %s -march=x86 -mattr=+sse2 -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=X32
+
+; On x86, an atomic rmw operation that does not modify the value in memory
+; (such as atomic add 0) can be replaced by an mfence followed by a mov.
+; This is explained (with the motivation for such an optimization) in
+; http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf
+
+define i8 @add8(i8* %p) {
+; CHECK-LABEL: add8
+; CHECK: mfence
+; CHECK: movb
+  %1 = atomicrmw add i8* %p, i8 0 monotonic
+  ret i8 %1
+}
+
+define i16 @or16(i16* %p) {
+; CHECK-LABEL: or16
+; CHECK: mfence
+; CHECK: movw
+  %1 = atomicrmw or i16* %p, i16 0 acquire
+  ret i16 %1
+}
+
+define i32 @xor32(i32* %p) {
+; CHECK-LABEL: xor32
+; CHECK: mfence
+; CHECK: movl
+  %1 = atomicrmw xor i32* %p, i32 0 release
+  ret i32 %1
+}
+
+define i64 @sub64(i64* %p) {
+; CHECK-LABEL: sub64
+; X64: mfence
+; X64: movq
+; X32-NOT: mfence
+  %1 = atomicrmw sub i64* %p, i64 0 seq_cst
+  ret i64 %1
+}
+
+define i128 @or128(i128* %p) {
+; CHECK-LABEL: or128
+; CHECK-NOT: mfence
+  %1 = atomicrmw or i128* %p, i128 0 monotonic
+  ret i128 %1
+}
+
+; For 'and', the idempotent value is (-1)
+define i32 @and32(i32* %p) {
+; CHECK-LABEL: and32
+; CHECK: mfence
+; CHECK: movl
+  %1 = atomicrmw and i32* %p, i32 -1 acq_rel
+  ret i32 %1
+}
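
One detail the CHECK lines above do not capture, shown here as a hedged sketch
rather than verified output: a load cannot carry a release or acq_rel ordering,
so lowerIdempotentRMWIntoFencedLoad maps the RMW's ordering through
AtomicCmpXchgInst::getStrongestFailureOrdering before putting it on the load,
and the preceding mfence provides the ordering guarantees. For the xor32 test,
the expanded IR would look roughly like this (alignment simplified):

    ; %1 = atomicrmw xor i32* %p, i32 0 release
    ; becomes (release is not a legal load ordering, so it is relaxed):
    declare void @llvm.x86.sse2.mfence()

    define i32 @xor32(i32* %p) {
      call void @llvm.x86.sse2.mfence()
      %v = load atomic i32* %p monotonic, align 4
      ret i32 %v
    }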