Skip to content

Commit

Permalink
[MCA] Fixed a bug where loads and stores were sometimes incorrectly m…
Browse files Browse the repository at this point in the history
…arked as depedent. Fixes PR45793.

This fixes a regression introduced by a very old commit 280ac1f (was
llvm-svn 361950).

Commit 280ac1f redesigned the logic in the LSUnit with the goal of
speeding up isReady() queries, and stabilising the LSUnit API (while also making
the load store unit more customisable).

The concept of MemoryGroup (effectively an alias set) was added by that commit
to better describe and track dependencies between memory operations.  However,
that concept was not just used for alias dependencies, but it was also used for
describing memory "order" dependencies (enforced by the memory consistency
model).

Instructions of a same memory group were considered "equivalent" as in:
independent operations that can potentially execute in parallel.  The problem
was that the cost of a dependency (in terms of number of cycles) should have
been different for "order" dependency. Instructions in an order dependency
simply have to have to wait until their predecessors are "issued" to an
underlying pipeline (rather than having to wait until predecessors have beeng
fully executed). For simple "order" dependencies, this was effectively
introducing an artificial delay on the "issue" of independent loads and stores.

This patch fixes the issue and adds a new test named 'independent-load-stores.s'
to a bunch of x86 targets. That test contains the reproducible posted by Fabian
Ritter on PR45793.

I had to rerun the update-mca-tests script on several files. To avoid expected
regressions on some Exynos tests, I have added a -noalias=false flag (to match
the old strict behavior on latencies).

Some tests for processor Barcelona are improved/fixed by this change and they
now show better results.  In a few tests we were incorrectly counting the time
spent by instructions in a scheduler queue.  In one case in particular we now
correctly see a store executed out of order.  That test was affected by the same
underlying issue reported as PR45793.

Reviewers: mattd

Differential Revision: https://reviews.llvm.org/D79351
  • Loading branch information
adibiagio committed May 5, 2020
1 parent 08032e7 commit 5578ec3
Show file tree
Hide file tree
Showing 18 changed files with 974 additions and 339 deletions.
49 changes: 37 additions & 12 deletions llvm/include/llvm/MCA/HardwareUnits/LSUnit.h
Expand Up @@ -40,7 +40,10 @@ class MemoryGroup {
unsigned NumInstructions;
unsigned NumExecuting;
unsigned NumExecuted;
SmallVector<MemoryGroup *, 4> Succ;
// Successors that are in a order dependency with this group.
SmallVector<MemoryGroup *, 4> OrderSucc;
// Successors that are in a data dependency with this group.
SmallVector<MemoryGroup *, 4> DataSucc;

CriticalDependency CriticalPredecessor;
InstRef CriticalMemoryInstruction;
Expand All @@ -55,8 +58,9 @@ class MemoryGroup {
NumExecuted(0), CriticalPredecessor(), CriticalMemoryInstruction() {}
MemoryGroup(MemoryGroup &&) = default;

ArrayRef<MemoryGroup *> getSuccessors() const { return Succ; }
unsigned getNumSuccessors() const { return Succ.size(); }
size_t getNumSuccessors() const {
return OrderSucc.size() + DataSucc.size();
}
unsigned getNumPredecessors() const { return NumPredecessors; }
unsigned getNumExecutingPredecessors() const {
return NumExecutingPredecessors;
Expand All @@ -75,12 +79,22 @@ class MemoryGroup {
return CriticalPredecessor;
}

void addSuccessor(MemoryGroup *Group) {
void addSuccessor(MemoryGroup *Group, bool IsDataDependent) {
// Do not need to add a dependency if there is no data
// dependency and all instructions from this group have been
// issued already.
if (!IsDataDependent && isExecuting())
return;

Group->NumPredecessors++;
assert(!isExecuted() && "Should have been removed!");
if (isExecuting())
Group->onGroupIssued(CriticalMemoryInstruction);
Succ.emplace_back(Group);
Group->onGroupIssued(CriticalMemoryInstruction, IsDataDependent);

if (IsDataDependent)
DataSucc.emplace_back(Group);
else
OrderSucc.emplace_back(Group);
}

bool isWaiting() const {
Expand All @@ -98,10 +112,13 @@ class MemoryGroup {
}
bool isExecuted() const { return NumInstructions == NumExecuted; }

void onGroupIssued(const InstRef &IR) {
void onGroupIssued(const InstRef &IR, bool ShouldUpdateCriticalDep) {
assert(!isReady() && "Unexpected group-start event!");
NumExecutingPredecessors++;

if (!ShouldUpdateCriticalDep)
return;

unsigned Cycles = IR.getInstruction()->getCyclesLeft();
if (CriticalPredecessor.Cycles < Cycles) {
CriticalPredecessor.IID = IR.getSourceIndex();
Expand Down Expand Up @@ -133,8 +150,14 @@ class MemoryGroup {
return;

// Notify successors that this group started execution.
for (MemoryGroup *MG : Succ)
MG->onGroupIssued(CriticalMemoryInstruction);
for (MemoryGroup *MG : OrderSucc) {
MG->onGroupIssued(CriticalMemoryInstruction, false);
// Release the order dependency with this group.
MG->onGroupExecuted();
}

for (MemoryGroup *MG : DataSucc)
MG->onGroupIssued(CriticalMemoryInstruction, true);
}

void onInstructionExecuted() {
Expand All @@ -145,8 +168,8 @@ class MemoryGroup {
if (!isExecuted())
return;

// Notify successors that this group has finished execution.
for (MemoryGroup *MG : Succ)
// Notify data dependent successors that this group has finished execution.
for (MemoryGroup *MG : DataSucc)
MG->onGroupExecuted();
}

Expand Down Expand Up @@ -412,6 +435,7 @@ class LSUnit : public LSUnitBase {
unsigned CurrentLoadGroupID;
unsigned CurrentLoadBarrierGroupID;
unsigned CurrentStoreGroupID;
unsigned CurrentStoreBarrierGroupID;

public:
LSUnit(const MCSchedModel &SM)
Expand All @@ -420,7 +444,8 @@ class LSUnit : public LSUnitBase {
: LSUnit(SM, LQ, SQ, /* NoAlias */ false) {}
LSUnit(const MCSchedModel &SM, unsigned LQ, unsigned SQ, bool AssumeNoAlias)
: LSUnitBase(SM, LQ, SQ, AssumeNoAlias), CurrentLoadGroupID(0),
CurrentLoadBarrierGroupID(0), CurrentStoreGroupID(0) {}
CurrentLoadBarrierGroupID(0), CurrentStoreGroupID(0),
CurrentStoreBarrierGroupID(0) {}

/// Returns LSU_AVAILABLE if there are enough load/store queue entries to
/// accomodate instruction IR.
Expand Down
84 changes: 63 additions & 21 deletions llvm/lib/MCA/HardwareUnits/LSUnit.cpp
Expand Up @@ -77,9 +77,6 @@ unsigned LSUnit::dispatch(const InstRef &IR) {
acquireSQSlot();

if (Desc.MayStore) {
// Always create a new group for store operations.

// A store may not pass a previous store or store barrier.
unsigned NewGID = createMemoryGroup();
MemoryGroup &NewGroup = getGroup(NewGID);
NewGroup.addInstruction();
Expand All @@ -91,16 +88,32 @@ unsigned LSUnit::dispatch(const InstRef &IR) {
MemoryGroup &IDom = getGroup(ImmediateLoadDominator);
LLVM_DEBUG(dbgs() << "[LSUnit]: GROUP DEP: (" << ImmediateLoadDominator
<< ") --> (" << NewGID << ")\n");
IDom.addSuccessor(&NewGroup);
IDom.addSuccessor(&NewGroup, !assumeNoAlias());
}

// A store may not pass a previous store barrier.
if (CurrentStoreBarrierGroupID) {
MemoryGroup &StoreGroup = getGroup(CurrentStoreBarrierGroupID);
LLVM_DEBUG(dbgs() << "[LSUnit]: GROUP DEP: ("
<< CurrentStoreBarrierGroupID
<< ") --> (" << NewGID << ")\n");
StoreGroup.addSuccessor(&NewGroup, true);
}
if (CurrentStoreGroupID) {

// A store may not pass a previous store.
if (CurrentStoreGroupID &&
(CurrentStoreGroupID != CurrentStoreBarrierGroupID)) {
MemoryGroup &StoreGroup = getGroup(CurrentStoreGroupID);
LLVM_DEBUG(dbgs() << "[LSUnit]: GROUP DEP: (" << CurrentStoreGroupID
<< ") --> (" << NewGID << ")\n");
StoreGroup.addSuccessor(&NewGroup);
StoreGroup.addSuccessor(&NewGroup, !assumeNoAlias());
}


CurrentStoreGroupID = NewGID;
if (IsMemBarrier)
CurrentStoreBarrierGroupID = NewGID;

if (Desc.MayLoad) {
CurrentLoadGroupID = NewGID;
if (IsMemBarrier)
Expand All @@ -112,31 +125,59 @@ unsigned LSUnit::dispatch(const InstRef &IR) {

assert(Desc.MayLoad && "Expected a load!");

// Always create a new memory group if this is the first load of the sequence.
unsigned ImmediateLoadDominator =
std::max(CurrentLoadGroupID, CurrentLoadBarrierGroupID);

// A new load group is created if we are in one of the following situations:
// 1) This is a load barrier (by construction, a load barrier is always
// assigned to a different memory group).
// 2) There is no load in flight (by construction we always keep loads and
// stores into separate memory groups).
// 3) There is a load barrier in flight. This load depends on it.
// 4) There is an intervening store between the last load dispatched to the
// LSU and this load. We always create a new group even if this load
// does not alias the last dispatched store.
// 5) There is no intervening store and there is an active load group.
// However that group has already started execution, so we cannot add
// this load to it.
bool ShouldCreateANewGroup =
IsMemBarrier || !ImmediateLoadDominator ||
CurrentLoadBarrierGroupID == ImmediateLoadDominator ||
ImmediateLoadDominator <= CurrentStoreGroupID ||
getGroup(ImmediateLoadDominator).isExecuting();

// A load may not pass a previous store unless flag 'NoAlias' is set.
// A load may pass a previous load.
// A younger load cannot pass a older load barrier.
// A load barrier cannot pass a older load.
bool ShouldCreateANewGroup = !CurrentLoadGroupID || IsMemBarrier ||
CurrentLoadGroupID <= CurrentStoreGroupID ||
CurrentLoadGroupID <= CurrentLoadBarrierGroupID;
if (ShouldCreateANewGroup) {
unsigned NewGID = createMemoryGroup();
MemoryGroup &NewGroup = getGroup(NewGID);
NewGroup.addInstruction();

// A load may not pass a previous store or store barrier
// unless flag 'NoAlias' is set.
if (!assumeNoAlias() && CurrentStoreGroupID) {
MemoryGroup &StGroup = getGroup(CurrentStoreGroupID);
MemoryGroup &StoreGroup = getGroup(CurrentStoreGroupID);
LLVM_DEBUG(dbgs() << "[LSUnit]: GROUP DEP: (" << CurrentStoreGroupID
<< ") --> (" << NewGID << ")\n");
StGroup.addSuccessor(&NewGroup);
StoreGroup.addSuccessor(&NewGroup, true);
}
if (CurrentLoadBarrierGroupID) {
MemoryGroup &LdGroup = getGroup(CurrentLoadBarrierGroupID);
LLVM_DEBUG(dbgs() << "[LSUnit]: GROUP DEP: (" << CurrentLoadBarrierGroupID
<< ") --> (" << NewGID << ")\n");
LdGroup.addSuccessor(&NewGroup);

// A load barrier may not pass a previous load or load barrier.
if (IsMemBarrier) {
if (ImmediateLoadDominator) {
MemoryGroup &LoadGroup = getGroup(ImmediateLoadDominator);
LLVM_DEBUG(dbgs() << "[LSUnit]: GROUP DEP: ("
<< ImmediateLoadDominator
<< ") --> (" << NewGID << ")\n");
LoadGroup.addSuccessor(&NewGroup, true);
}
} else {
// A younger load cannot pass a older load barrier.
if (CurrentLoadBarrierGroupID) {
MemoryGroup &LoadGroup = getGroup(CurrentLoadBarrierGroupID);
LLVM_DEBUG(dbgs() << "[LSUnit]: GROUP DEP: ("
<< CurrentLoadBarrierGroupID
<< ") --> (" << NewGID << ")\n");
LoadGroup.addSuccessor(&NewGroup, true);
}
}

CurrentLoadGroupID = NewGID;
Expand All @@ -145,6 +186,7 @@ unsigned LSUnit::dispatch(const InstRef &IR) {
return NewGID;
}

// A load may pass a previous load.
MemoryGroup &Group = getGroup(CurrentLoadGroupID);
Group.addInstruction();
return CurrentLoadGroupID;
Expand Down
6 changes: 3 additions & 3 deletions llvm/test/tools/llvm-mca/AArch64/Exynos/asimd-st1.s
@@ -1,7 +1,7 @@
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m3 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M3
# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m4 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M4
# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m5 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M5
# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m3 -resource-pressure=false -noalias=false < %s | FileCheck %s -check-prefixes=ALL,M3
# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m4 -resource-pressure=false -noalias=false < %s | FileCheck %s -check-prefixes=ALL,M4
# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m5 -resource-pressure=false -noalias=false < %s | FileCheck %s -check-prefixes=ALL,M5

st1 {v0.s}[0], [sp]
st1 {v0.2s}, [sp]
Expand Down
6 changes: 3 additions & 3 deletions llvm/test/tools/llvm-mca/AArch64/Exynos/asimd-st2.s
@@ -1,7 +1,7 @@
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m3 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M3
# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m4 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M4
# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m5 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M5
# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m3 -resource-pressure=false -noalias=false < %s | FileCheck %s -check-prefixes=ALL,M3
# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m4 -resource-pressure=false -noalias=false < %s | FileCheck %s -check-prefixes=ALL,M4
# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m5 -resource-pressure=false -noalias=false < %s | FileCheck %s -check-prefixes=ALL,M5

st2 {v0.s, v1.s}[0], [sp]
st2 {v0.2s, v1.2s}, [sp]
Expand Down
6 changes: 3 additions & 3 deletions llvm/test/tools/llvm-mca/AArch64/Exynos/asimd-st3.s
@@ -1,7 +1,7 @@
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m3 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M3
# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m4 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M4
# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m5 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M5
# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m3 -resource-pressure=false -noalias=false < %s | FileCheck %s -check-prefixes=ALL,M3
# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m4 -resource-pressure=false -noalias=false < %s | FileCheck %s -check-prefixes=ALL,M4
# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m5 -resource-pressure=false -noalias=false < %s | FileCheck %s -check-prefixes=ALL,M5

st3 {v0.s, v1.s, v2.s}[0], [sp]
st3 {v0.2s, v1.2s, v2.2s}, [sp]
Expand Down
6 changes: 3 additions & 3 deletions llvm/test/tools/llvm-mca/AArch64/Exynos/asimd-st4.s
@@ -1,7 +1,7 @@
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m3 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M3
# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m4 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M4
# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m5 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M5
# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m3 -resource-pressure=false -noalias=false < %s | FileCheck %s -check-prefixes=ALL,M3
# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m4 -resource-pressure=false -noalias=false < %s | FileCheck %s -check-prefixes=ALL,M4
# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m5 -resource-pressure=false -noalias=false < %s | FileCheck %s -check-prefixes=ALL,M5

st4 {v0.s, v1.s, v2.s, v3.s}[0], [sp]
st4 {v0.2s, v1.2s, v2.2s, v3.2s}, [sp]
Expand Down
6 changes: 3 additions & 3 deletions llvm/test/tools/llvm-mca/AArch64/Exynos/float-store.s
@@ -1,7 +1,7 @@
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m3 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M3
# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m4 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M4
# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m5 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M5
# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m3 -resource-pressure=false -noalias=false < %s | FileCheck %s -check-prefixes=ALL,M3
# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m4 -resource-pressure=false -noalias=false < %s | FileCheck %s -check-prefixes=ALL,M4
# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m5 -resource-pressure=false -noalias=false < %s | FileCheck %s -check-prefixes=ALL,M5

stur d0, [sp, #2]
stur q0, [sp, #16]
Expand Down
6 changes: 3 additions & 3 deletions llvm/test/tools/llvm-mca/AArch64/Exynos/store.s
@@ -1,7 +1,7 @@
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m3 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M3
# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m4 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M4
# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m5 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M5
# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m3 -resource-pressure=false -noalias=false < %s | FileCheck %s -check-prefixes=ALL,M3
# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m4 -resource-pressure=false -noalias=false < %s | FileCheck %s -check-prefixes=ALL,M4
# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m5 -resource-pressure=false -noalias=false < %s | FileCheck %s -check-prefixes=ALL,M5

stur x0, [sp, #8]
strb w0, [sp], #1
Expand Down

0 comments on commit 5578ec3

Please sign in to comment.