[EarlyCSE] Value forwarding for unordered atomics
This patch teaches the fully redundant load part of EarlyCSE how to forward from atomic and volatile loads and stores, and how to eliminate unordered atomics (only). This patch does not include dead store elimination support for unordered atomics; that will follow in the near future.

The basic idea is that we allow all loads and stores to be tracked by the AvailableLoad table. We store a bit in the table which tracks whether the load/store was atomic, and then only replace atomic loads with ones which were also atomic.

No attempt is made to refine our handling of ordered loads or stores. Those are still treated as full fences. We could pretty easily extend the release fence handling to release stores, but that should be a separate patch.
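
To make the new legality check concrete, the following is a minimal standalone C++ sketch of the condition the patch adds before replacing a load (the MemAccess struct and canForward helper are illustrative names, not LLVM APIs): a cached value may only stand in for a later load that is neither volatile nor ordered, and an atomic load may only take a value that was itself produced by an atomic access.

// Minimal standalone sketch of the forwarding rule (illustrative names,
// not LLVM code).
#include <cassert>

struct MemAccess {
  bool IsAtomic;   // unordered atomic access
  bool IsVolatile; // volatile access
  bool IsOrdered;  // acquire/release/seq_cst ordering
};

// Mirrors the legality part of the condition the patch adds:
//   !MemInst.isVolatile() && MemInst.isUnordered() &&
//   InVal.IsAtomic >= MemInst.isAtomic()
bool canForward(const MemAccess &Available, const MemAccess &Later) {
  if (Later.IsVolatile || Later.IsOrdered)
    return false;                                // never remove ordered/volatile loads
  return Available.IsAtomic || !Later.IsAtomic;  // non-atomic value can't feed an atomic load
}

int main() {
  MemAccess Simple{false, false, false};
  MemAccess Unordered{true, false, false};
  MemAccess SeqCst{true, false, true};

  assert(canForward(SeqCst, Simple));     // atomic -> non-atomic is legal (test13)
  assert(canForward(SeqCst, Unordered));  // atomic -> unordered is legal (test14)
  assert(!canForward(SeqCst, SeqCst));    // never replace a load stronger than unordered (test15)
  assert(!canForward(Simple, Unordered)); // non-atomic -> atomic is wrong (test16)
  return 0;
}

The real pass additionally requires a matching pointer operand, a matching generation count, and a matching target memory-intrinsic id, as shown in the diff below; the sketch captures only the atomicity/volatility check.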

Differential Revision: http://reviews.llvm.org/D15337

llvm-svn: 255054
preames committed Dec 8, 2015
1 parent 0aea1b8 commit 8fc2cbf
Showing 2 changed files with 204 additions and 19 deletions.
96 changes: 77 additions & 19 deletions llvm/lib/Transforms/Scalar/EarlyCSE.cpp
@@ -281,21 +281,31 @@ class EarlyCSE {
/// that dominated values can succeed in their lookup.
ScopedHTType AvailableValues;

/// \brief A scoped hash table of the current values of loads.
/// A scoped hash table of the current values of previously encountered memory
/// locations.
///
/// This allows us to get efficient access to dominating loads when we have
/// a fully redundant load. In addition to the most recent load, we keep
/// track of a generation count of the read, which is compared against the
/// current generation count. The current generation count is incremented
/// This allows us to get efficient access to dominating loads or stores when
/// we have a fully redundant load. In addition to the most recent load, we
/// keep track of a generation count of the read, which is compared against
/// the current generation count. The current generation count is incremented
/// after every possibly writing memory operation, which ensures that we only
/// CSE loads with other loads that have no intervening store.
/// CSE loads with other loads that have no intervening store. Ordering
/// events (such as fences or atomic instructions) increment the generation
/// count as well; essentially, we model these as writes to all possible
/// locations. Note that atomic and/or volatile loads and stores can be
/// present in the table; it is the responsibility of the consumer to inspect
/// the atomicity/volatility if needed.
struct LoadValue {
Value *Data;
unsigned Generation;
int MatchingId;
LoadValue() : Data(nullptr), Generation(0), MatchingId(-1) {}
LoadValue(Value *Data, unsigned Generation, unsigned MatchingId)
: Data(Data), Generation(Generation), MatchingId(MatchingId) {}
bool IsAtomic;
LoadValue()
: Data(nullptr), Generation(0), MatchingId(-1), IsAtomic(false) {}
LoadValue(Value *Data, unsigned Generation, unsigned MatchingId,
bool IsAtomic)
: Data(Data), Generation(Generation), MatchingId(MatchingId),
IsAtomic(IsAtomic) {}
};
typedef RecyclingAllocator<BumpPtrAllocator,
ScopedHashTableVal<Value *, LoadValue>>
@@ -410,6 +420,42 @@ class EarlyCSE {
}
return Inst->isAtomic();
}
bool isAtomic() const {
if (IsTargetMemInst) {
assert(Info.IsSimple && "need to refine IsSimple in TTI");
return false;
}
return Inst->isAtomic();
}
bool isUnordered() const {
if (IsTargetMemInst) {
assert(Info.IsSimple && "need to refine IsSimple in TTI");
return true;
}
if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
return LI->isUnordered();
} else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
return SI->isUnordered();
}
// Conservative answer
return !Inst->isAtomic();
}

bool isVolatile() const {
if (IsTargetMemInst) {
assert(Info.IsSimple && "need to refine IsSimple in TTI");
return false;
}
if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
return LI->isVolatile();
} else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
return SI->isVolatile();
}
// Conservative answer
return true;
}


bool isMatchingMemLoc(const ParseMemoryInst &Inst) const {
return (getPointerOperand() == Inst.getPointerOperand() &&
getMatchingId() == Inst.getMatchingId());
@@ -561,20 +607,22 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
ParseMemoryInst MemInst(Inst, TTI);
// If this is a non-volatile load, process it.
if (MemInst.isValid() && MemInst.isLoad()) {
// Ignore volatile or ordered loads.
if (!MemInst.isSimple()) {
// (conservatively) we can't peek past the ordering implied by this
// operation, but we can add this load to our set of available values
if (MemInst.isVolatile() || !MemInst.isUnordered()) {
LastStore = nullptr;
// Don't CSE across synchronization boundaries.
if (Inst->mayWriteToMemory())
++CurrentGeneration;
continue;
++CurrentGeneration;
}

// If we have an available version of this load, and if it is the right
// generation, replace this instruction.
LoadValue InVal = AvailableLoads.lookup(MemInst.getPointerOperand());
if (InVal.Data != nullptr && InVal.Generation == CurrentGeneration &&
InVal.MatchingId == MemInst.getMatchingId()) {
InVal.MatchingId == MemInst.getMatchingId() &&
// We don't yet handle removing loads with ordering of any kind.
!MemInst.isVolatile() && MemInst.isUnordered() &&
// We can't replace an atomic load with one which isn't also atomic.
InVal.IsAtomic >= MemInst.isAtomic()) {
Value *Op = getOrCreateResult(InVal.Data, Inst->getType());
if (Op != nullptr) {
DEBUG(dbgs() << "EarlyCSE CSE LOAD: " << *Inst
@@ -591,7 +639,8 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
// Otherwise, remember that we have this instruction.
AvailableLoads.insert(
MemInst.getPointerOperand(),
LoadValue(Inst, CurrentGeneration, MemInst.getMatchingId()));
LoadValue(Inst, CurrentGeneration, MemInst.getMatchingId(),
MemInst.isAtomic()));
LastStore = nullptr;
continue;
}
@@ -646,9 +695,12 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {

if (MemInst.isValid() && MemInst.isStore()) {
// We do a trivial form of DSE if there are two stores to the same
// location with no intervening loads. Delete the earlier store.
// location with no intervening loads. Delete the earlier store. Note
// that we can delete an earlier simple store even if the following one
// is an ordered/volatile/atomic store.
if (LastStore) {
ParseMemoryInst LastStoreMemInst(LastStore, TTI);
assert(LastStoreMemInst.isSimple() && "Violated invariant");
if (LastStoreMemInst.isMatchingMemLoc(MemInst)) {
DEBUG(dbgs() << "EarlyCSE DEAD STORE: " << *LastStore
<< " due to: " << *Inst << '\n');
@@ -667,11 +719,17 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
// the store.
AvailableLoads.insert(
MemInst.getPointerOperand(),
LoadValue(Inst, CurrentGeneration, MemInst.getMatchingId()));
LoadValue(Inst, CurrentGeneration, MemInst.getMatchingId(),
MemInst.isAtomic()));

// Remember that this was the last normal store we saw for DSE.
// Note that we can't delete an earlier atomic or volatile store in
// favor of a later one which isn't. We could in principle remove an
// earlier unordered store if the later one is also unordered.
if (MemInst.isSimple())
LastStore = Inst;
else
LastStore = nullptr;
}
}
}
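Before the test file, here is a minimal standalone C++ sketch of the generation-count bookkeeping that the new AvailableLoads comment describes (illustrative names and simplified types, not LLVM's scoped hash table): every load or store is recorded at the current generation, any possible write or ordering event bumps the generation, and a later load may reuse a recorded value only while the generation still matches and the value is at least as atomic as the load itself.

// Minimal standalone model of the generation-count scheme (illustrative
// names, not LLVM code).
#include <cassert>
#include <string>
#include <unordered_map>

struct CachedValue {
  int Value;
  unsigned Generation;
  bool IsAtomic;
};

struct AvailableLoadsModel {
  std::unordered_map<std::string, CachedValue> Table; // keyed by pointer name
  unsigned CurrentGeneration = 0;

  // Every load or store (atomic or not) is recorded at the current generation.
  void record(const std::string &Ptr, int V, bool IsAtomic) {
    Table[Ptr] = CachedValue{V, CurrentGeneration, IsAtomic};
  }

  // Any possibly-writing instruction, fence, or ordered operation bumps the
  // generation, invalidating everything recorded earlier.
  void clobber() { ++CurrentGeneration; }

  // A later load may reuse a cached value only if the generation still
  // matches and the value is at least as "atomic" as the load itself.
  bool lookup(const std::string &Ptr, bool LoadIsAtomic, int &Out) const {
    auto It = Table.find(Ptr);
    if (It == Table.end() || It->second.Generation != CurrentGeneration)
      return false;
    if (LoadIsAtomic && !It->second.IsAtomic)
      return false;
    Out = It->second.Value;
    return true;
  }
};

int main() {
  AvailableLoadsModel AL;
  int V;
  AL.record("P1", 42, /*IsAtomic=*/false);
  assert(AL.lookup("P1", /*LoadIsAtomic=*/false, V) && V == 42); // same generation: forward
  assert(!AL.lookup("P1", /*LoadIsAtomic=*/true, V));            // non-atomic value, atomic load: no
  AL.clobber();                                                  // e.g. the seq_cst load in test12
  assert(!AL.lookup("P1", /*LoadIsAtomic=*/false, V));           // stale generation: no forwarding
  return 0;
}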
127 changes: 127 additions & 0 deletions llvm/test/Transforms/EarlyCSE/atomics.ll
@@ -0,0 +1,127 @@
; RUN: opt < %s -S -early-cse | FileCheck %s

; CHECK-LABEL: @test12(
define i32 @test12(i1 %B, i32* %P1, i32* %P2) {
%load0 = load i32, i32* %P1
%1 = load atomic i32, i32* %P2 seq_cst, align 4
%load1 = load i32, i32* %P1
%sel = select i1 %B, i32 %load0, i32 %load1
ret i32 %sel
; CHECK: load i32, i32* %P1
; CHECK: load i32, i32* %P1
}

; CHECK-LABEL: @test13(
; atomic to non-atomic forwarding is legal
define i32 @test13(i1 %B, i32* %P1) {
%a = load atomic i32, i32* %P1 seq_cst, align 4
%b = load i32, i32* %P1
%res = sub i32 %a, %b
ret i32 %res
; CHECK: load atomic i32, i32* %P1
; CHECK: ret i32 0
}

; CHECK-LABEL: @test14(
; atomic to unordered atomic forwarding is legal
define i32 @test14(i1 %B, i32* %P1) {
%a = load atomic i32, i32* %P1 seq_cst, align 4
%b = load atomic i32, i32* %P1 unordered, align 4
%res = sub i32 %a, %b
ret i32 %res
; CHECK: load atomic i32, i32* %P1
; CHECK: ret i32 0
}

; CHECK-LABEL: @test15(
; implementation restriction: can't forward to stronger
; than unordered
define i32 @test15(i1 %B, i32* %P1, i32* %P2) {
%a = load atomic i32, i32* %P1 seq_cst, align 4
%b = load atomic i32, i32* %P1 seq_cst, align 4
%res = sub i32 %a, %b
ret i32 %res
; CHECK: load atomic i32, i32* %P1
; CHECK: load atomic i32, i32* %P1
}

; CHECK-LABEL: @test16(
; forwarding non-atomic to atomic is wrong! (However,
; it would be legal to use the later value in place of the
; former in this particular example. We just don't
; do that right now.)
define i32 @test16(i1 %B, i32* %P1, i32* %P2) {
%a = load i32, i32* %P1, align 4
%b = load atomic i32, i32* %P1 unordered, align 4
%res = sub i32 %a, %b
ret i32 %res
; CHECK: load i32, i32* %P1
; CHECK: load atomic i32, i32* %P1
}

; Can't DSE across a full fence
define void @test17(i1 %B, i32* %P1, i32* %P2) {
; CHECK-LABEL: @test17
; CHECK: store
; CHECK: store atomic
; CHECK: store
store i32 0, i32* %P1, align 4
store atomic i32 0, i32* %P2 seq_cst, align 4
store i32 0, i32* %P1, align 4
ret void
}

; Can't remove a volatile load
define i32 @test18(i1 %B, i32* %P1, i32* %P2) {
%a = load i32, i32* %P1, align 4
%b = load volatile i32, i32* %P1, align 4
%res = sub i32 %a, %b
ret i32 %res
; CHECK-LABEL: @test18
; CHECK: load i32, i32* %P1
; CHECK: load volatile i32, i32* %P1
}

; Can't DSE a volatile store
define void @test19(i1 %B, i32* %P1, i32* %P2) {
; CHECK-LABEL: @test19
; CHECK: store volatile
; CHECK: store
store volatile i32 0, i32* %P1, align 4
store i32 3, i32* %P1, align 4
ret void
}

; Can value forward from volatiles
define i32 @test20(i1 %B, i32* %P1, i32* %P2) {
%a = load volatile i32, i32* %P1, align 4
%b = load i32, i32* %P1, align 4
%res = sub i32 %a, %b
ret i32 %res
; CHECK-LABEL: @test20
; CHECK: load volatile i32, i32* %P1
; CHECK: ret i32 0
}

; Can DSE a non-volatile store in favor of a volatile one
; currently a missed optimization
define void @test21(i1 %B, i32* %P1, i32* %P2) {
; CHECK-LABEL: @test21
; CHECK: store
; CHECK: store volatile
store i32 0, i32* %P1, align 4
store volatile i32 3, i32* %P1, align 4
ret void
}

; Can DSE a normal store in favor of an unordered one
define void @test22(i1 %B, i32* %P1, i32* %P2) {
; CHECK-LABEL: @test22
; CHECK-NEXT: store atomic
store i32 0, i32* %P1, align 4
store atomic i32 3, i32* %P1 unordered, align 4
ret void
}


