72 changes: 72 additions & 0 deletions llvm/test/tools/llvm-mca/X86/BtVer2/bottleneck-hints-2.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=100 -timeline -timeline-max-iterations=1 -bottleneck-analysis < %s | FileCheck %s

vhaddps %xmm0, %xmm0, %xmm1

# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 100
# CHECK-NEXT: Total Cycles: 106
# CHECK-NEXT: Total uOps: 100

# CHECK: Dispatch Width: 2
# CHECK-NEXT: uOps Per Cycle: 0.94
# CHECK-NEXT: IPC: 0.94
# CHECK-NEXT: Block RThroughput: 1.0

# CHECK: Cycles with backend pressure increase [ 76.42% ]
# CHECK-NEXT: Throughput Bottlenecks:
# CHECK-NEXT: Resource Pressure [ 76.42% ]
# CHECK-NEXT: - JFPA [ 76.42% ]
# CHECK-NEXT: - JFPU0 [ 76.42% ]
# CHECK-NEXT: Data Dependencies: [ 0.00% ]
# CHECK-NEXT: - Register Dependencies [ 0.00% ]
# CHECK-NEXT: - Memory Dependencies [ 0.00% ]

# CHECK: Instruction Info:
# CHECK-NEXT: [1]: #uOps
# CHECK-NEXT: [2]: Latency
# CHECK-NEXT: [3]: RThroughput
# CHECK-NEXT: [4]: MayLoad
# CHECK-NEXT: [5]: MayStore
# CHECK-NEXT: [6]: HasSideEffects (U)

# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
# CHECK-NEXT: 1 4 1.00 vhaddps %xmm0, %xmm0, %xmm1

# CHECK: Resources:
# CHECK-NEXT: [0] - JALU0
# CHECK-NEXT: [1] - JALU1
# CHECK-NEXT: [2] - JDiv
# CHECK-NEXT: [3] - JFPA
# CHECK-NEXT: [4] - JFPM
# CHECK-NEXT: [5] - JFPU0
# CHECK-NEXT: [6] - JFPU1
# CHECK-NEXT: [7] - JLAGU
# CHECK-NEXT: [8] - JMul
# CHECK-NEXT: [9] - JSAGU
# CHECK-NEXT: [10] - JSTC
# CHECK-NEXT: [11] - JVALU0
# CHECK-NEXT: [12] - JVALU1
# CHECK-NEXT: [13] - JVIMUL

# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
# CHECK-NEXT: - - - 1.00 - 1.00 - - - - - - - -

# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
# CHECK-NEXT: - - - 1.00 - 1.00 - - - - - - - - vhaddps %xmm0, %xmm0, %xmm1

# CHECK: Timeline view:
# CHECK-NEXT: Index 0123456

# CHECK: [0,0] DeeeeER vhaddps %xmm0, %xmm0, %xmm1

# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 vhaddps %xmm0, %xmm0, %xmm1
106 changes: 106 additions & 0 deletions llvm/test/tools/llvm-mca/X86/BtVer2/bottleneck-hints-3.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=1500 -noalias=false -timeline -timeline-max-iterations=1 -bottleneck-analysis < %s | FileCheck %s

vmovaps (%rsi), %xmm0
vmovaps %xmm0, (%rdi)
vmovaps 16(%rsi), %xmm0
vmovaps %xmm0, 16(%rdi)
vmovaps 32(%rsi), %xmm0
vmovaps %xmm0, 32(%rdi)
vmovaps 48(%rsi), %xmm0
vmovaps %xmm0, 48(%rdi)

# CHECK: Iterations: 1500
# CHECK-NEXT: Instructions: 12000
# CHECK-NEXT: Total Cycles: 36003
# CHECK-NEXT: Total uOps: 12000

# CHECK: Dispatch Width: 2
# CHECK-NEXT: uOps Per Cycle: 0.33
# CHECK-NEXT: IPC: 0.33
# CHECK-NEXT: Block RThroughput: 4.0

# CHECK: Cycles with backend pressure increase [ 99.89% ]
# CHECK-NEXT: Throughput Bottlenecks:
# CHECK-NEXT: Resource Pressure [ 0.00% ]
# CHECK-NEXT: Data Dependencies: [ 99.89% ]
# CHECK-NEXT: - Register Dependencies [ 0.00% ]
# CHECK-NEXT: - Memory Dependencies [ 99.89% ]

# CHECK: Instruction Info:
# CHECK-NEXT: [1]: #uOps
# CHECK-NEXT: [2]: Latency
# CHECK-NEXT: [3]: RThroughput
# CHECK-NEXT: [4]: MayLoad
# CHECK-NEXT: [5]: MayStore
# CHECK-NEXT: [6]: HasSideEffects (U)

# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
# CHECK-NEXT: 1 5 1.00 * vmovaps (%rsi), %xmm0
# CHECK-NEXT: 1 1 1.00 * vmovaps %xmm0, (%rdi)
# CHECK-NEXT: 1 5 1.00 * vmovaps 16(%rsi), %xmm0
# CHECK-NEXT: 1 1 1.00 * vmovaps %xmm0, 16(%rdi)
# CHECK-NEXT: 1 5 1.00 * vmovaps 32(%rsi), %xmm0
# CHECK-NEXT: 1 1 1.00 * vmovaps %xmm0, 32(%rdi)
# CHECK-NEXT: 1 5 1.00 * vmovaps 48(%rsi), %xmm0
# CHECK-NEXT: 1 1 1.00 * vmovaps %xmm0, 48(%rdi)

# CHECK: Resources:
# CHECK-NEXT: [0] - JALU0
# CHECK-NEXT: [1] - JALU1
# CHECK-NEXT: [2] - JDiv
# CHECK-NEXT: [3] - JFPA
# CHECK-NEXT: [4] - JFPM
# CHECK-NEXT: [5] - JFPU0
# CHECK-NEXT: [6] - JFPU1
# CHECK-NEXT: [7] - JLAGU
# CHECK-NEXT: [8] - JMul
# CHECK-NEXT: [9] - JSAGU
# CHECK-NEXT: [10] - JSTC
# CHECK-NEXT: [11] - JVALU0
# CHECK-NEXT: [12] - JVALU1
# CHECK-NEXT: [13] - JVIMUL

# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
# CHECK-NEXT: - - - 2.00 2.00 4.00 4.00 4.00 - 4.00 4.00 - - -

# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
# CHECK-NEXT: - - - - 1.00 1.00 - 1.00 - - - - - - vmovaps (%rsi), %xmm0
# CHECK-NEXT: - - - - - - 1.00 - - 1.00 1.00 - - - vmovaps %xmm0, (%rdi)
# CHECK-NEXT: - - - 1.00 - 1.00 - 1.00 - - - - - - vmovaps 16(%rsi), %xmm0
# CHECK-NEXT: - - - - - - 1.00 - - 1.00 1.00 - - - vmovaps %xmm0, 16(%rdi)
# CHECK-NEXT: - - - - 1.00 1.00 - 1.00 - - - - - - vmovaps 32(%rsi), %xmm0
# CHECK-NEXT: - - - - - - 1.00 - - 1.00 1.00 - - - vmovaps %xmm0, 32(%rdi)
# CHECK-NEXT: - - - 1.00 - 1.00 - 1.00 - - - - - - vmovaps 48(%rsi), %xmm0
# CHECK-NEXT: - - - - - - 1.00 - - 1.00 1.00 - - - vmovaps %xmm0, 48(%rdi)

# CHECK: Timeline view:
# CHECK-NEXT: 0123456789
# CHECK-NEXT: Index 0123456789 0123456

# CHECK: [0,0] DeeeeeER . . . .. vmovaps (%rsi), %xmm0
# CHECK-NEXT: [0,1] D=====eER . . . .. vmovaps %xmm0, (%rdi)
# CHECK-NEXT: [0,2] .D=====eeeeeER . . .. vmovaps 16(%rsi), %xmm0
# CHECK-NEXT: [0,3] .D==========eER. . .. vmovaps %xmm0, 16(%rdi)
# CHECK-NEXT: [0,4] . D==========eeeeeER. .. vmovaps 32(%rsi), %xmm0
# CHECK-NEXT: [0,5] . D===============eER .. vmovaps %xmm0, 32(%rdi)
# CHECK-NEXT: [0,6] . D===============eeeeeER. vmovaps 48(%rsi), %xmm0
# CHECK-NEXT: [0,7] . D====================eER vmovaps %xmm0, 48(%rdi)

# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 vmovaps (%rsi), %xmm0
# CHECK-NEXT: 1. 1 6.0 0.0 0.0 vmovaps %xmm0, (%rdi)
# CHECK-NEXT: 2. 1 6.0 0.0 0.0 vmovaps 16(%rsi), %xmm0
# CHECK-NEXT: 3. 1 11.0 0.0 0.0 vmovaps %xmm0, 16(%rdi)
# CHECK-NEXT: 4. 1 11.0 0.0 0.0 vmovaps 32(%rsi), %xmm0
# CHECK-NEXT: 5. 1 16.0 0.0 0.0 vmovaps %xmm0, 32(%rdi)
# CHECK-NEXT: 6. 1 16.0 0.0 0.0 vmovaps 48(%rsi), %xmm0
# CHECK-NEXT: 7. 1 21.0 0.0 0.0 vmovaps %xmm0, 48(%rdi)
102 changes: 100 additions & 2 deletions llvm/tools/llvm-mca/Views/SummaryView.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,14 @@ namespace mca {
SummaryView::SummaryView(const MCSchedModel &Model, ArrayRef<MCInst> S,
unsigned Width)
: SM(Model), Source(S), DispatchWidth(Width), LastInstructionIdx(0),
TotalCycles(0), NumMicroOps(0),
TotalCycles(0), NumMicroOps(0), BPI({0, 0, 0, 0}),
ResourcePressureDistribution(Model.getNumProcResourceKinds(), 0),
ProcResourceUsage(Model.getNumProcResourceKinds(), 0),
ProcResourceMasks(Model.getNumProcResourceKinds()),
ResIdx2ProcResID(Model.getNumProcResourceKinds(), 0) {
ResIdx2ProcResID(Model.getNumProcResourceKinds(), 0),
PressureIncreasedBecauseOfResources(false),
PressureIncreasedBecauseOfDataDependencies(false),
SeenStallCycles(false) {
computeProcResourceMasks(SM, ProcResourceMasks);
for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
unsigned Index = getResourceStateIndex(ProcResourceMasks[I]);
Expand Down Expand Up @@ -61,6 +65,98 @@ void SummaryView::onEvent(const HWInstructionEvent &Event) {
}
}

void SummaryView::onEvent(const HWPressureEvent &Event) {
assert(Event.Reason != HWPressureEvent::INVALID &&
"Unexpected invalid event!");

switch (Event.Reason) {
default:
break;

case HWPressureEvent::RESOURCES: {
PressureIncreasedBecauseOfResources = true;
++BPI.ResourcePressureCycles;
uint64_t ResourceMask = Event.ResourceMask;
while (ResourceMask) {
uint64_t Current = ResourceMask & (-ResourceMask);
unsigned Index = getResourceStateIndex(Current);
unsigned ProcResID = ResIdx2ProcResID[Index];
const MCProcResourceDesc &PRDesc = *SM.getProcResource(ProcResID);
if (!PRDesc.SubUnitsIdxBegin) {
ResourcePressureDistribution[Index]++;
ResourceMask ^= Current;
continue;
}

for (unsigned I = 0, E = PRDesc.NumUnits; I < E; ++I) {
unsigned OtherProcResID = PRDesc.SubUnitsIdxBegin[I];
unsigned OtherMask = ProcResourceMasks[OtherProcResID];
ResourcePressureDistribution[getResourceStateIndex(OtherMask)]++;
}

ResourceMask ^= Current;
}
}

break;
case HWPressureEvent::REGISTER_DEPS:
PressureIncreasedBecauseOfDataDependencies = true;
++BPI.RegisterDependencyCycles;
break;
case HWPressureEvent::MEMORY_DEPS:
PressureIncreasedBecauseOfDataDependencies = true;
++BPI.MemoryDependencyCycles;
break;
}
}

void SummaryView::printBottleneckHints(raw_ostream &OS) const {
if (!SeenStallCycles || !BPI.PressureIncreaseCycles)
return;

double PressurePerCycle =
(double)BPI.PressureIncreaseCycles * 100 / TotalCycles;
double ResourcePressurePerCycle =
(double)BPI.ResourcePressureCycles * 100 / TotalCycles;
double DDPerCycle = (double)BPI.DataDependencyCycles * 100 / TotalCycles;
double RegDepPressurePerCycle =
(double)BPI.RegisterDependencyCycles * 100 / TotalCycles;
double MemDepPressurePerCycle =
(double)BPI.MemoryDependencyCycles * 100 / TotalCycles;

OS << "\nCycles with backend pressure increase [ "
<< format("%.2f", floor((PressurePerCycle * 100) + 0.5) / 100) << "% ]";

OS << "\nThroughput Bottlenecks: "
<< "\n Resource Pressure [ "
<< format("%.2f", floor((ResourcePressurePerCycle * 100) + 0.5) / 100)
<< "% ]";

if (BPI.PressureIncreaseCycles) {
for (unsigned I = 0, E = ResourcePressureDistribution.size(); I < E; ++I) {
if (ResourcePressureDistribution[I]) {
double Frequency =
(double)ResourcePressureDistribution[I] * 100 / TotalCycles;
unsigned Index = ResIdx2ProcResID[getResourceStateIndex(1ULL << I)];
const MCProcResourceDesc &PRDesc = *SM.getProcResource(Index);
OS << "\n - " << PRDesc.Name << " [ "
<< format("%.2f", floor((Frequency * 100) + 0.5) / 100) << "% ]";
}
}
}

OS << "\n Data Dependencies: [ "
<< format("%.2f", floor((DDPerCycle * 100) + 0.5) / 100) << "% ]";

OS << "\n - Register Dependencies [ "
<< format("%.2f", floor((RegDepPressurePerCycle * 100) + 0.5) / 100)
<< "% ]";

OS << "\n - Memory Dependencies [ "
<< format("%.2f", floor((MemDepPressurePerCycle * 100) + 0.5) / 100)
<< "% ]\n\n";
}

void SummaryView::printView(raw_ostream &OS) const {
unsigned Instructions = Source.size();
unsigned Iterations = (LastInstructionIdx / Instructions) + 1;
Expand All @@ -85,6 +181,8 @@ void SummaryView::printView(raw_ostream &OS) const {
TempStream << "\nBlock RThroughput: "
<< format("%.1f", floor((BlockRThroughput * 10) + 0.5) / 10)
<< '\n';

printBottleneckHints(TempStream);
TempStream.flush();
OS << Buffer;
}
Expand Down
46 changes: 45 additions & 1 deletion llvm/tools/llvm-mca/Views/SummaryView.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,25 @@ class SummaryView : public View {
unsigned TotalCycles;
// The total number of micro opcodes contributed by a block of instructions.
unsigned NumMicroOps;

struct BackPressureInfo {
// Cycles where backpressure increased.
unsigned PressureIncreaseCycles;
// Cycles where backpressure increased because of pipeline pressure.
unsigned ResourcePressureCycles;
// Cycles where backpressure increased because of data dependencies.
unsigned DataDependencyCycles;
// Cycles where backpressure increased because of register dependencies.
unsigned RegisterDependencyCycles;
// Cycles where backpressure increased because of memory dependencies.
unsigned MemoryDependencyCycles;
};
BackPressureInfo BPI;

// Resource pressure distribution. There is an element for every processor
// resource declared by the scheduling model. Quantities are number of cycles.
llvm::SmallVector<unsigned, 8> ResourcePressureDistribution;

// For each processor resource, this vector stores the cumulative number of
// resource cycles consumed by the analyzed code block.
llvm::SmallVector<unsigned, 8> ProcResourceUsage;
Expand All @@ -58,18 +77,43 @@ class SummaryView : public View {
// Used to map resource indices to actual processor resource IDs.
llvm::SmallVector<unsigned, 8> ResIdx2ProcResID;

// True if resource pressure events were notified during this cycle.
bool PressureIncreasedBecauseOfResources;
bool PressureIncreasedBecauseOfDataDependencies;

// True if throughput was affected by dispatch stalls.
bool SeenStallCycles;

// Compute the reciprocal throughput for the analyzed code block.
// The reciprocal block throughput is computed as the MAX between:
// - NumMicroOps / DispatchWidth
// - Total Resource Cycles / #Units (for every resource consumed).
double getBlockRThroughput() const;

// Prints a bottleneck message to OS.
void printBottleneckHints(llvm::raw_ostream &OS) const;

public:
SummaryView(const llvm::MCSchedModel &Model, llvm::ArrayRef<llvm::MCInst> S,
unsigned Width);

void onCycleEnd() override { ++TotalCycles; }
void onCycleEnd() override {
++TotalCycles;
if (PressureIncreasedBecauseOfResources ||
PressureIncreasedBecauseOfDataDependencies) {
++BPI.PressureIncreaseCycles;
if (PressureIncreasedBecauseOfDataDependencies)
++BPI.DataDependencyCycles;
PressureIncreasedBecauseOfResources = false;
PressureIncreasedBecauseOfDataDependencies = false;
}
}
void onEvent(const HWInstructionEvent &Event) override;
void onEvent(const HWStallEvent &Event) override {
SeenStallCycles = true;
}

void onEvent(const HWPressureEvent &Event) override;

void printView(llvm::raw_ostream &OS) const override;
};
Expand Down
8 changes: 7 additions & 1 deletion llvm/tools/llvm-mca/llvm-mca.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,11 @@ static cl::opt<bool>
cl::desc("Print all views including hardware statistics"),
cl::cat(ViewOptions), cl::init(false));

static cl::opt<bool> EnableBottleneckAnalysis(
"bottleneck-analysis",
cl::desc("Enable bottleneck analysis (disabled by default)"),
cl::cat(ViewOptions), cl::init(false));

namespace {

const Target *getTarget(const char *ProgName) {
Expand Down Expand Up @@ -387,7 +392,8 @@ int main(int argc, char **argv) {
mca::Context MCA(*MRI, *STI);

mca::PipelineOptions PO(Width, RegisterFileSize, LoadQueueSize,
StoreQueueSize, AssumeNoAlias);
StoreQueueSize, AssumeNoAlias,
EnableBottleneckAnalysis);

// Number each region in the sequence.
unsigned RegionIdx = 0;
Expand Down