100 changes: 100 additions & 0 deletions llvm/test/tools/llvm-mca/X86/BtVer2/memcpy-like-test.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=100 -timeline -timeline-max-iterations=1 < %s | FileCheck %s

# Straight-line copy of 64 bytes from (%rsi) to (%rdi), written as four
# 16-byte load/store pairs at offsets 0, 16, 32 and 48. Every pair goes
# through %xmm0.
vmovaps (%rsi), %xmm0
vmovaps %xmm0, (%rdi)
vmovaps 16(%rsi), %xmm0
vmovaps %xmm0, 16(%rdi)
vmovaps 32(%rsi), %xmm0
vmovaps %xmm0, 32(%rdi)
vmovaps 48(%rsi), %xmm0
vmovaps %xmm0, 48(%rdi)


# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 800
# CHECK-NEXT: Total Cycles: 408
# CHECK-NEXT: Dispatch Width: 2
# CHECK-NEXT: IPC: 1.96


# CHECK: Resources:
# CHECK-NEXT: [0] - JALU0
# CHECK-NEXT: [1] - JALU1
# CHECK-NEXT: [2] - JDiv
# CHECK-NEXT: [3] - JFPA
# CHECK-NEXT: [4] - JFPM
# CHECK-NEXT: [5] - JFPU0
# CHECK-NEXT: [6] - JFPU1
# CHECK-NEXT: [7] - JLAGU
# CHECK-NEXT: [8] - JMul
# CHECK-NEXT: [9] - JSAGU
# CHECK-NEXT: [10] - JSTC
# CHECK-NEXT: [11] - JVALU0
# CHECK-NEXT: [12] - JVALU1
# CHECK-NEXT: [13] - JVIMUL


# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
# CHECK-NEXT: - - - - - - - 4.00 - 4.00 - - - -

# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
# CHECK-NEXT: - - - - - - - 1.00 - - - - - - vmovaps (%rsi), %xmm0
# CHECK-NEXT: - - - - - - - - - 1.00 - - - - vmovaps %xmm0, (%rdi)
# CHECK-NEXT: - - - - - - - 1.00 - - - - - - vmovaps 16(%rsi), %xmm0
# CHECK-NEXT: - - - - - - - - - 1.00 - - - - vmovaps %xmm0, 16(%rdi)
# CHECK-NEXT: - - - - - - - 1.00 - - - - - - vmovaps 32(%rsi), %xmm0
# CHECK-NEXT: - - - - - - - - - 1.00 - - - - vmovaps %xmm0, 32(%rdi)
# CHECK-NEXT: - - - - - - - 1.00 - - - - - - vmovaps 48(%rsi), %xmm0
# CHECK-NEXT: - - - - - - - - - 1.00 - - - - vmovaps %xmm0, 48(%rdi)


# CHECK: Instruction Info:
# CHECK-NEXT: [1]: #uOps
# CHECK-NEXT: [2]: Latency
# CHECK-NEXT: [3]: RThroughput
# CHECK-NEXT: [4]: MayLoad
# CHECK-NEXT: [5]: MayStore
# CHECK-NEXT: [6]: HasSideEffects

# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
# CHECK-NEXT: 1 5 1.00 * vmovaps (%rsi), %xmm0
# CHECK-NEXT: 1 1 1.00 * vmovaps %xmm0, (%rdi)
# CHECK-NEXT: 1 5 1.00 * vmovaps 16(%rsi), %xmm0
# CHECK-NEXT: 1 1 1.00 * vmovaps %xmm0, 16(%rdi)
# CHECK-NEXT: 1 5 1.00 * vmovaps 32(%rsi), %xmm0
# CHECK-NEXT: 1 1 1.00 * vmovaps %xmm0, 32(%rdi)
# CHECK-NEXT: 1 5 1.00 * vmovaps 48(%rsi), %xmm0
# CHECK-NEXT: 1 1 1.00 * vmovaps %xmm0, 48(%rdi)


# CHECK: Timeline view:
# CHECK-NEXT: 01
# CHECK-NEXT: Index 0123456789

# CHECK: [0,0] DeeeeeER .. vmovaps (%rsi), %xmm0
# CHECK-NEXT: [0,1] D=====eER .. vmovaps %xmm0, (%rdi)
# CHECK-NEXT: [0,2] .DeeeeeER .. vmovaps 16(%rsi), %xmm0
# CHECK-NEXT: [0,3] .D=====eER.. vmovaps %xmm0, 16(%rdi)
# CHECK-NEXT: [0,4] . DeeeeeER.. vmovaps 32(%rsi), %xmm0
# CHECK-NEXT: [0,5] . D=====eER. vmovaps %xmm0, 32(%rdi)
# CHECK-NEXT: [0,6] . DeeeeeER. vmovaps 48(%rsi), %xmm0
# CHECK-NEXT: [0,7] . D=====eER vmovaps %xmm0, 48(%rdi)


# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 vmovaps (%rsi), %xmm0
# CHECK-NEXT: 1. 1 6.0 0.0 0.0 vmovaps %xmm0, (%rdi)
# CHECK-NEXT: 2. 1 1.0 1.0 0.0 vmovaps 16(%rsi), %xmm0
# CHECK-NEXT: 3. 1 6.0 0.0 0.0 vmovaps %xmm0, 16(%rdi)
# CHECK-NEXT: 4. 1 1.0 1.0 0.0 vmovaps 32(%rsi), %xmm0
# CHECK-NEXT: 5. 1 6.0 0.0 0.0 vmovaps %xmm0, 32(%rdi)
# CHECK-NEXT: 6. 1 1.0 1.0 0.0 vmovaps 48(%rsi), %xmm0
# CHECK-NEXT: 7. 1 6.0 0.0 0.0 vmovaps %xmm0, 48(%rdi)
45 changes: 45 additions & 0 deletions llvm/test/tools/llvm-mca/X86/BtVer2/simple-test.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=100 < %s | FileCheck %s

# A single register-to-register 32-bit add. The report expected for btver2
# over 100 iterations follows.
add %edi, %eax

# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 100
# CHECK-NEXT: Total Cycles: 103
# CHECK-NEXT: Dispatch Width: 2
# CHECK-NEXT: IPC: 0.97

# CHECK-LABEL: Resources:
# CHECK-NEXT: [0] - JALU0
# CHECK-NEXT: [1] - JALU1
# CHECK-NEXT: [2] - JDiv
# CHECK-NEXT: [3] - JFPA
# CHECK-NEXT: [4] - JFPM
# CHECK-NEXT: [5] - JFPU0
# CHECK-NEXT: [6] - JFPU1
# CHECK-NEXT: [7] - JLAGU
# CHECK-NEXT: [8] - JMul
# CHECK-NEXT: [9] - JSAGU
# CHECK-NEXT: [10] - JSTC
# CHECK-NEXT: [11] - JVALU0
# CHECK-NEXT: [12] - JVALU1
# CHECK-NEXT: [13] - JVIMUL


# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - -

# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - - addl %edi, %eax

# CHECK: Instruction Info:
# CHECK-NEXT: [1]: #uOps
# CHECK-NEXT: [2]: Latency
# CHECK-NEXT: [3]: RThroughput
# CHECK-NEXT: [4]: MayLoad
# CHECK-NEXT: [5]: MayStore
# CHECK-NEXT: [6]: HasSideEffects

# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
# CHECK-NEXT: 1 1 0.50 addl %edi, %eax
27 changes: 27 additions & 0 deletions llvm/test/tools/llvm-mca/X86/cpus.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 < %s | FileCheck --check-prefix=ALL --check-prefix=BTVER2 %s
# RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=znver1 < %s | FileCheck --check-prefix=ALL --check-prefix=ZNVER1 %s
# RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge < %s | FileCheck --check-prefix=ALL --check-prefix=SANDYBRIDGE %s
# RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=ivybridge < %s | FileCheck --check-prefix=ALL --check-prefix=IVYBRIDGE %s
# RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=haswell < %s | FileCheck --check-prefix=ALL --check-prefix=HASWELL %s
# RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=broadwell < %s | FileCheck --check-prefix=ALL --check-prefix=BROADWELL %s
# RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=knl < %s | FileCheck --check-prefix=ALL --check-prefix=KNL %s
# RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=skylake < %s | FileCheck --check-prefix=ALL --check-prefix=SKX %s
# RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 < %s | FileCheck --check-prefix=ALL --check-prefix=SKX-AVX512 %s
# RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=slm < %s | FileCheck --check-prefix=ALL --check-prefix=SLM %s

# One 32-bit add, analyzed once for each -mcpu value used by the commands
# above. The iteration and instruction counts are expected to be the same for
# every CPU. The dispatch width is checked per CPU.
add %edi, %eax

# ALL: Iterations: 70
# ALL-NEXT: Instructions: 70

# BTVER2: Dispatch Width: 2
# ZNVER1: Dispatch Width: 4
# SANDYBRIDGE: Dispatch Width: 4
# IVYBRIDGE: Dispatch Width: 4
# HASWELL: Dispatch Width: 4
# BROADWELL: Dispatch Width: 4
# KNL: Dispatch Width: 4
# SKX: Dispatch Width: 6
# SKX-AVX512: Dispatch Width: 6
# SLM: Dispatch Width: 2

11 changes: 11 additions & 0 deletions llvm/test/tools/llvm-mca/X86/default-iterations.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 < %s 2>&1 | FileCheck --check-prefix=DEFAULT %s
# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=0 < %s 2>&1 | FileCheck --check-prefix=DEFAULT %s
# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=1 < %s 2>&1 | FileCheck --check-prefix=CUSTOM %s

# The commands above leave -iterations unset, set it to 0, or set it to 1.
# The unset case and the 0 case are both expected to report 70 iterations.
add %eax, %eax

# DEFAULT: Iterations: 70
# DEFAULT-NEXT: Instructions: 70

# CUSTOM: Iterations: 1
# CUSTOM-NEXT: Instructions: 1
8 changes: 8 additions & 0 deletions llvm/test/tools/llvm-mca/X86/dispatch_width.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 < %s 2>&1 | FileCheck --check-prefix=DEFAULT %s
# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -dispatch=0 < %s 2>&1 | FileCheck --check-prefix=DEFAULT %s
# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -dispatch=1 < %s 2>&1 | FileCheck --check-prefix=CUSTOM %s

# The commands above leave -dispatch unset, set it to 0, or set it to 1.
# The unset case and the 0 case are both expected to report a width of 2.
add %eax, %eax

# DEFAULT: Dispatch Width: 2
# CUSTOM: Dispatch Width: 1
3 changes: 3 additions & 0 deletions llvm/test/tools/llvm-mca/X86/in-order-cpu.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# RUN: not llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=atom -o %t1 2>&1 | FileCheck %s

# CHECK: error: please specify an out-of-order cpu. 'atom' is an in-order cpu.
3 changes: 3 additions & 0 deletions llvm/test/tools/llvm-mca/X86/invalid-assembly-sequence.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# RUN: not llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 %s

# Deliberately not an x86 mnemonic. The command above expects llvm-mca to
# exit with a failure status for this input.
invalid_instruction_mnemonic
3 changes: 3 additions & 0 deletions llvm/test/tools/llvm-mca/X86/invalid-cpu.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# RUN: not llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=foo -o %t1 2>&1 | FileCheck %s

# CHECK: 'foo' is not a recognized processor for this target (ignoring processor)
3 changes: 3 additions & 0 deletions llvm/test/tools/llvm-mca/X86/invalid-empty-file.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# RUN: not llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 -o %t1 2>&1 | FileCheck %s

# CHECK: error: no assembly instructions found.
3 changes: 3 additions & 0 deletions llvm/test/tools/llvm-mca/X86/lit.local.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Mark every test in this directory as unsupported unless 'X86' is listed in
# config.root.targets.
if 'X86' not in config.root.targets:
    config.unsupported = True

4 changes: 4 additions & 0 deletions llvm/test/tools/llvm-mca/X86/no-sched-model.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# RUN: not llvm-mca -mtriple=x86_64-unknown-unknown < %s 2>&1 | FileCheck %s
# RUN: not llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=generic < %s 2>&1 | FileCheck %s

# CHECK: error: unable to find instruction-level scheduling information for target triple 'x86_64-unknown-unknown' and cpu 'generic'.
3 changes: 3 additions & 0 deletions llvm/test/tools/llvm-mca/invalid_input_file_name.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# RUN: not llvm-mca %t.blah -o %t2 2>&1 | FileCheck --check-prefix=ENOENT %s

# ENOENT: {{.*}}.blah: {{[Nn]}}o such file or directory
4 changes: 4 additions & 0 deletions llvm/test/tools/llvm-mca/lit.local.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# These tests require a non-empty default triple, which is detected through
# the 'default_triple' entry of config.available_features.
has_default_triple = 'default_triple' in config.available_features
if not has_default_triple:
    config.unsupported = True

1 change: 1 addition & 0 deletions llvm/tools/LLVMBuild.txt
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ subdirectories =
llvm-link
llvm-lto
llvm-mc
llvm-mca
llvm-mcmarkup
llvm-modextract
llvm-mt
Expand Down
132 changes: 132 additions & 0 deletions llvm/tools/llvm-mca/Backend.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
//===--------------------- Backend.cpp --------------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
/// \file
///
/// Implementation of class Backend which emulates a hardware OoO backend.
///
//===----------------------------------------------------------------------===//

#include "Backend.h"
#include "HWEventListener.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/Support/Debug.h"
#include <cassert>

namespace mca {

#define DEBUG_TYPE "llvm-mca"

using namespace llvm;

/// Registers \p Listener so that it receives the events emitted by this
/// backend. A null pointer is ignored.
void Backend::addEventListener(HWEventListener *Listener) {
  if (!Listener)
    return;
  Listeners.insert(Listener);
}

void Backend::runCycle(unsigned Cycle) {
notifyCycleBegin(Cycle);

if (!SM->hasNext()) {
notifyCycleEnd(Cycle);
return;
}

InstRef IR = SM->peekNext();
const InstrDesc *Desc = &IB->getOrCreateInstrDesc(STI, *IR.second);
while (DU->isAvailable(Desc->NumMicroOps) && DU->canDispatch(*Desc)) {
Instruction *NewIS = IB->createInstruction(STI, *DU, IR.first, *IR.second);
Instructions[IR.first] = std::unique_ptr<Instruction>(NewIS);
NewIS->setRCUTokenID(DU->dispatch(IR.first, NewIS));

// If this is a zero latency instruction, then we don't need to dispatch
// it. Instead, we can mark it as executed.
if (NewIS->isZeroLatency())
notifyInstructionExecuted(IR.first);

// Check if we have dispatched all the instructions.
SM->updateNext();
if (!SM->hasNext())
break;

// Prepare for the next round.
IR = SM->peekNext();
Desc = &IB->getOrCreateInstrDesc(STI, *IR.second);
}

notifyCycleEnd(Cycle);
}

void Backend::notifyCycleBegin(unsigned Cycle) {
DEBUG(dbgs() << "[E] Cycle begin: " << Cycle << '\n');
for (HWEventListener *Listener : Listeners)
Listener->onCycleBegin(Cycle);

DU->cycleEvent(Cycle);
HWS->cycleEvent(Cycle);
}

void Backend::notifyInstructionDispatched(unsigned Index) {
DEBUG(dbgs() << "[E] Instruction Dispatched: " << Index << '\n');
for (HWEventListener *Listener : Listeners)
Listener->onInstructionDispatched(Index);
}

void Backend::notifyInstructionReady(unsigned Index) {
DEBUG(dbgs() << "[E] Instruction Ready: " << Index << '\n');
for (HWEventListener *Listener : Listeners)
Listener->onInstructionReady(Index);
}

void Backend::notifyInstructionIssued(
unsigned Index, const ArrayRef<std::pair<ResourceRef, unsigned>> &Used) {
DEBUG(
dbgs() << "[E] Instruction Issued: " << Index << '\n';
for (const std::pair<ResourceRef, unsigned> &Resource : Used) {
dbgs() << "[E] Resource Used: [" << Resource.first.first << '.'
<< Resource.first.second << "]\n";
dbgs() << " cycles: " << Resource.second << '\n';
}
);

for (HWEventListener *Listener : Listeners)
Listener->onInstructionIssued(Index, Used);
}

void Backend::notifyInstructionExecuted(unsigned Index) {
DEBUG(dbgs() << "[E] Instruction Executed: " << Index << '\n');
for (HWEventListener *Listener : Listeners)
Listener->onInstructionExecuted(Index);

const Instruction &IS = *Instructions[Index];
DU->onInstructionExecuted(IS.getRCUTokenID());
}

void Backend::notifyInstructionRetired(unsigned Index) {
DEBUG(dbgs() << "[E] Instruction Retired: " << Index << '\n');
for (HWEventListener *Listener : Listeners)
Listener->onInstructionRetired(Index);

const Instruction &IS = *Instructions[Index];
DU->invalidateRegisterMappings(IS);
Instructions.erase(Index);
}

void Backend::notifyResourceAvailable(const ResourceRef &RR) {
DEBUG(dbgs() << "[E] Resource Available: [" << RR.first << '.' << RR.second
<< "]\n");
for (HWEventListener *Listener : Listeners)
Listener->onResourceAvailable(RR);
}

void Backend::notifyCycleEnd(unsigned Cycle) {
DEBUG(dbgs() << "[E] Cycle end: " << Cycle << "\n\n");
for (HWEventListener *Listener : Listeners)
Listener->onCycleEnd(Cycle);
}

} // namespace mca.
141 changes: 141 additions & 0 deletions llvm/tools/llvm-mca/Backend.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
//===--------------------- Backend.h ----------------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
/// \file
///
/// This file implements an OoO backend for the llvm-mca tool.
///
//===----------------------------------------------------------------------===//

#ifndef LLVM_TOOLS_LLVM_MCA_BACKEND_H
#define LLVM_TOOLS_LLVM_MCA_BACKEND_H

#include "Dispatch.h"
#include "InstrBuilder.h"
#include "Scheduler.h"
#include "SourceMgr.h"

namespace mca {

struct HWEventListener;

/// \brief An out of order backend for a specific subtarget.
///
/// It emulates an out-of-order execution of instructions. Instructions are
/// fetched from a MCInst sequence managed by an object of class SourceMgr.
/// Instructions are firstly dispatched to the schedulers and then executed.
/// This class tracks the lifetime of an instruction from the moment where
/// it gets dispatched to the schedulers, to the moment where it finishes
/// executing and register writes are architecturally committed.
/// In particular, it monitors changes in the state of every instruction
/// in flight.
/// Instructions are executed in a loop of iterations. The number of iterations
/// is defined by the SourceMgr object.
/// The Backend entrypoint is method 'run()', which executes cycles in a loop
/// for as long as there are new instructions to dispatch, or the dispatch unit
/// reports that its RCU is not empty.
/// Internally, the Backend collects statistical information in the form of
/// histograms. For example, it tracks how the dispatch group size changes
/// over time.
/// NOTE(review): no histogram state is declared in this class; the histograms
/// appear to be gathered by registered listeners (see addEventListener) -
/// confirm.
class Backend {
const llvm::MCSubtargetInfo &STI;

// Sub-components owned by the backend. HWS, DU and SM are set up in the
// constructor's initializer list; IB is created in the constructor body.
std::unique_ptr<InstrBuilder> IB;
std::unique_ptr<Scheduler> HWS;
std::unique_ptr<DispatchUnit> DU;
std::unique_ptr<SourceMgr> SM;
// Number of cycles simulated so far. run() passes the current value to
// runCycle() and then increments it.
unsigned Cycles;

// Instructions currently tracked by the backend, keyed by instruction index.
llvm::DenseMap<unsigned, std::unique_ptr<Instruction>> Instructions;
// Listeners that receive the events emitted by the notify*() methods.
std::set<HWEventListener *> Listeners;

// Simulates a single cycle; called by run() once per cycle.
void runCycle(unsigned Cycle);

public:
/// Builds the scheduler and the dispatch unit (both receive a pointer to this
/// backend), takes ownership of \p Source, and creates the instruction builder
/// from the scheduler's processor resource masks.
///
/// The numeric parameters and \p AssumeNoAlias are forwarded unchanged to the
/// scheduler and to the dispatch unit.
/// NOTE(review): a value of 0 presumably selects a default derived from the
/// scheduling model - confirm in Scheduler and DispatchUnit.
Backend(const llvm::MCSubtargetInfo &Subtarget, const llvm::MCInstrInfo &MCII,
const llvm::MCRegisterInfo &MRI, std::unique_ptr<SourceMgr> Source,
unsigned DispatchWidth = 0, unsigned RegisterFileSize = 0,
unsigned MaxRetirePerCycle = 0, unsigned LoadQueueSize = 0,
unsigned StoreQueueSize = 0, bool AssumeNoAlias = false)
: STI(Subtarget),
HWS(llvm::make_unique<Scheduler>(this, Subtarget.getSchedModel(),
LoadQueueSize, StoreQueueSize,
AssumeNoAlias)),
DU(llvm::make_unique<DispatchUnit>(
this, MRI, Subtarget.getSchedModel().MicroOpBufferSize,
RegisterFileSize, MaxRetirePerCycle, DispatchWidth, HWS.get())),
SM(std::move(Source)), Cycles(0) {
// IB is created here, not in the initializer list, because it needs the
// resource masks exposed by the already constructed scheduler.
IB = llvm::make_unique<InstrBuilder>(MCII, getProcResourceMasks());
}

/// Runs cycles until the source manager has no more instructions and the
/// dispatch unit reports that its RCU is empty.
void run() {
while (SM->hasNext() || !DU->isRCUEmpty())
runCycle(Cycles++);
}

// Simple accessors that forward to the source manager, the dispatch unit, or
// the subtarget info.
unsigned getNumIterations() const { return SM->getNumIterations(); }
unsigned getNumInstructions() const { return SM->size(); }
unsigned getNumCycles() const { return Cycles; }
unsigned getTotalRegisterMappingsCreated() const {
return DU->getTotalRegisterMappingsCreated();
}
unsigned getMaxUsedRegisterMappings() const {
return DU->getMaxUsedRegisterMappings();
}
unsigned getDispatchWidth() const { return DU->getDispatchWidth(); }

const llvm::MCSubtargetInfo &getSTI() const { return STI; }
const llvm::MCSchedModel &getSchedModel() const {
return STI.getSchedModel();
}
const llvm::ArrayRef<uint64_t> getProcResourceMasks() const {
return HWS->getProcResourceMasks();
}

// Queries answered by the scheduler.
double getRThroughput(const InstrDesc &ID) const {
return HWS->getRThroughput(ID);
}
void getBuffersUsage(std::vector<BufferUsageEntry> &Usage) const {
return HWS->getBuffersUsage(Usage);
}

// Dispatch stall counters, all read from the dispatch unit.
unsigned getNumRATStalls() const { return DU->getNumRATStalls(); }
unsigned getNumRCUStalls() const { return DU->getNumRCUStalls(); }
unsigned getNumSQStalls() const { return DU->getNumSQStalls(); }
unsigned getNumLDQStalls() const { return DU->getNumLDQStalls(); }
unsigned getNumSTQStalls() const { return DU->getNumSTQStalls(); }
unsigned getNumDispatchGroupStalls() const {
return DU->getNumDispatchGroupStalls();
}

const llvm::MCInst &getMCInstFromIndex(unsigned Index) const {
return SM->getMCInstFromIndex(Index);
}

const InstrDesc &getInstrDesc(const llvm::MCInst &Inst) const {
return IB->getOrCreateInstrDesc(STI, Inst);
}

const SourceMgr &getSourceMgr() const { return *SM; }

// Listener registration and event notification entry points.
void addEventListener(HWEventListener *Listener);
void notifyCycleBegin(unsigned Cycle);
void notifyInstructionDispatched(unsigned Index);
void notifyInstructionReady(unsigned Index);
void notifyInstructionIssued(
unsigned Index,
const llvm::ArrayRef<std::pair<ResourceRef, unsigned>> &Used);
void notifyInstructionExecuted(unsigned Index);
void notifyResourceAvailable(const ResourceRef &RR);
void notifyInstructionRetired(unsigned Index);
void notifyCycleEnd(unsigned Cycle);
};

} // namespace mca

#endif
209 changes: 209 additions & 0 deletions llvm/tools/llvm-mca/BackendPrinter.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,209 @@
//===--------------------- BackendPrinter.cpp -------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
/// \file
///
/// This file implements the BackendPrinter interface.
///
//===----------------------------------------------------------------------===//

#include "BackendPrinter.h"
#include "llvm/CodeGen/TargetSchedule.h"

namespace mca {

using namespace llvm;

/// Opens the output file named \p OutputFile. An empty name is replaced by
/// "-". On failure the error message is written to errs() and a null pointer
/// is returned.
std::unique_ptr<ToolOutputFile>
BackendPrinter::getOutputStream(std::string OutputFile) {
  if (OutputFile.empty())
    OutputFile = "-";

  std::error_code EC;
  auto Stream =
      llvm::make_unique<ToolOutputFile>(OutputFile, EC, sys::fs::F_None);
  if (EC) {
    errs() << EC.message() << '\n';
    return nullptr;
  }
  return Stream;
}

void BackendPrinter::printGeneralStatistics(unsigned Iterations,
unsigned Cycles,
unsigned Instructions,
unsigned DispatchWidth) const {
unsigned TotalInstructions = Instructions * Iterations;
double IPC = (double)TotalInstructions / Cycles;

std::string Buffer;
raw_string_ostream TempStream(Buffer);
TempStream << "Iterations: " << Iterations;
TempStream << "\nInstructions: " << TotalInstructions;
TempStream << "\nTotal Cycles: " << Cycles;
TempStream << "\nDispatch Width: " << DispatchWidth;
TempStream << "\nIPC: " << format("%.2f", IPC) << '\n';
TempStream.flush();
File->os() << Buffer;
}

void BackendPrinter::printRATStatistics(unsigned TotalMappings,
unsigned MaxUsedMappings) const {
std::string Buffer;
raw_string_ostream TempStream(Buffer);
TempStream << "\n\nRegister Alias Table:";
TempStream << "\nTotal number of mappings created: " << TotalMappings;
TempStream << "\nMax number of mappings used: " << MaxUsedMappings
<< '\n';
TempStream.flush();
File->os() << Buffer;
}

void BackendPrinter::printDispatchStalls(unsigned RATStalls, unsigned RCUStalls,
unsigned SCHEDQStalls,
unsigned LDQStalls, unsigned STQStalls,
unsigned DGStalls) const {
std::string Buffer;
raw_string_ostream TempStream(Buffer);
TempStream << "\n\nDynamic Dispatch Stall Cycles:\n";
TempStream << "RAT - Register unavailable: "
<< RATStalls;
TempStream << "\nRCU - Retire tokens unavailable: "
<< RCUStalls;
TempStream << "\nSCHEDQ - Scheduler full: "
<< SCHEDQStalls;
TempStream << "\nLQ - Load queue full: "
<< LDQStalls;
TempStream << "\nSQ - Store queue full: "
<< STQStalls;
TempStream << "\nGROUP - Static restrictions on the dispatch group: "
<< DGStalls;
TempStream << '\n';
TempStream.flush();
File->os() << Buffer;
}

void BackendPrinter::printSchedulerUsage(
const MCSchedModel &SM, const ArrayRef<BufferUsageEntry> &Usage) const {
std::string Buffer;
raw_string_ostream TempStream(Buffer);
TempStream << "\n\nScheduler's queue usage:\n";
const ArrayRef<uint64_t> ResourceMasks = B.getProcResourceMasks();
for (unsigned I = 0, E = SM.getNumProcResourceKinds(); I < E; ++I) {
const MCProcResourceDesc &ProcResource = *SM.getProcResource(I);
if (!ProcResource.BufferSize)
continue;

for (const BufferUsageEntry &Entry : Usage)
if (ResourceMasks[I] == Entry.first)
TempStream << ProcResource.Name << ", " << Entry.second << '/'
<< ProcResource.BufferSize << '\n';
}

TempStream.flush();
File->os() << Buffer;
}

/// Prints the "Instruction Info" section: a legend for the six columns,
/// followed by one row per instruction of the analyzed sequence. Each row
/// shows the number of micro opcodes, the latency, the reciprocal throughput,
/// three markers (may load / may store / has side effects) and the printed
/// instruction.
void BackendPrinter::printInstructionInfo() const {
// The table is formatted into a temporary string and written out at the end.
std::string Buffer;
raw_string_ostream TempStream(Buffer);

TempStream << "\n\nInstruction Info:\n";
TempStream << "[1]: #uOps\n[2]: Latency\n[3]: RThroughput\n"
<< "[4]: MayLoad\n[5]: MayStore\n[6]: HasSideEffects\n\n";

TempStream << "[1] [2] [3] [4] [5] [6]\tInstructions:\n";
for (unsigned I = 0, E = B.getNumInstructions(); I < E; ++I) {
const MCInst &Inst = B.getMCInstFromIndex(I);
const InstrDesc &ID = B.getInstrDesc(Inst);
unsigned NumMicroOpcodes = ID.NumMicroOps;
unsigned Latency = ID.MaxLatency;
double RThroughput = B.getRThroughput(ID);
// Column [1]: extra padding is appended after short values (one branch for
// values below 10, another for values below 100).
// NOTE(review): the exact column widths depend on the whitespace inside the
// string literals below - confirm against the expected tool output.
TempStream << ' ' << NumMicroOpcodes << " ";
if (NumMicroOpcodes < 10)
TempStream << " ";
else if (NumMicroOpcodes < 100)
TempStream << ' ';
// Column [2]: same padding scheme as column [1].
TempStream << Latency << " ";
if (Latency < 10.0)
TempStream << " ";
else if (Latency < 100.0)
TempStream << ' ';
// Column [3]: a throughput of zero is printed as a dash.
if (RThroughput) {
TempStream << format("%.2f", RThroughput) << ' ';
if (RThroughput < 10.0)
TempStream << " ";
else if (RThroughput < 100.0)
TempStream << ' ';
} else {
TempStream << " - ";
}
// Columns [4]-[6]: a '*' marks a property that is set for the instruction.
TempStream << (ID.MayLoad ? " * " : " ");
TempStream << (ID.MayStore ? " * " : " ");
TempStream << (ID.HasSideEffects ? " * " : " ");
MCIP->printInst(&Inst, TempStream, "", B.getSTI());
TempStream << '\n';
}

TempStream.flush();
File->os() << Buffer;
}

void BackendPrinter::printReport() const {
assert(isFileValid());
unsigned Cycles = B.getNumCycles();
printGeneralStatistics(B.getNumIterations(), Cycles, B.getNumInstructions(),
B.getDispatchWidth());
if (EnableVerboseOutput) {
printDispatchStalls(B.getNumRATStalls(), B.getNumRCUStalls(),
B.getNumSQStalls(), B.getNumLDQStalls(),
B.getNumSTQStalls(), B.getNumDispatchGroupStalls());
printRATStatistics(B.getTotalRegisterMappingsCreated(),
B.getMaxUsedRegisterMappings());
BS->printHistograms(File->os());

std::vector<BufferUsageEntry> Usage;
B.getBuffersUsage(Usage);
printSchedulerUsage(B.getSchedModel(), Usage);
}

if (RPV) {
RPV->printResourcePressure(getOStream(), Cycles);
printInstructionInfo();
}

if (TV) {
TV->printTimeline(getOStream());
TV->printAverageWaitTimes(getOStream());
}
}

/// Creates the resource pressure view and registers it as a listener of the
/// backend. Does nothing if the view already exists.
void BackendPrinter::addResourcePressureView() {
  if (RPV)
    return;

  RPV = llvm::make_unique<ResourcePressureView>(
      B.getSTI(), *MCIP, B.getSourceMgr(), B.getProcResourceMasks());
  B.addEventListener(RPV.get());
}

/// Creates the timeline view, bounded by \p MaxIterations and \p MaxCycles,
/// and registers it as a listener of the backend. Does nothing if the view
/// already exists.
void BackendPrinter::addTimelineView(unsigned MaxIterations,
                                     unsigned MaxCycles) {
  if (TV)
    return;

  TV = llvm::make_unique<TimelineView>(B.getSTI(), *MCIP, B.getSourceMgr(),
                                       MaxIterations, MaxCycles);
  B.addEventListener(TV.get());
}

/// Opens the output file, turns off hexadecimal printing of immediates in the
/// instruction printer and, in verbose mode only, creates the statistics
/// collector and registers it as a listener of the backend.
void BackendPrinter::initialize(std::string OutputFileName) {
  File = getOutputStream(OutputFileName);
  MCIP->setPrintImmHex(false);

  if (!EnableVerboseOutput)
    return;

  BS = llvm::make_unique<BackendStatistics>();
  B.addEventListener(BS.get());
}

} // namespace mca.
102 changes: 102 additions & 0 deletions llvm/tools/llvm-mca/BackendPrinter.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
//===--------------------- BackendPrinter.h ---------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
/// \file
///
/// This file implements class BackendPrinter.
/// BackendPrinter is able to collect statistics related to the code executed
/// by the Backend class. Information is then printed out with the help of
/// a MCInstPrinter (to pretty print MCInst objects) and other helper classes.
///
//===----------------------------------------------------------------------===//

#ifndef LLVM_TOOLS_LLVM_MCA_BACKENDPRINTER_H
#define LLVM_TOOLS_LLVM_MCA_BACKENDPRINTER_H

#include "Backend.h"
#include "BackendStatistics.h"
#include "ResourcePressureView.h"
#include "TimelineView.h"
#include "llvm/MC/MCInstPrinter.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/FileUtilities.h"
#include "llvm/Support/ToolOutputFile.h"

#define DEBUG_TYPE "llvm-mca"

namespace mca {

class ResourcePressureView;
class TimelineView;

/// \brief A printer class that knows how to collect statistics on the
/// code analyzed by the llvm-mca tool.
///
/// This class knows how to print out the analysis information collected
/// during the execution of the code. Internally, it delegates to other
/// classes the task of printing out timeline information as well as
/// resource pressure.
class BackendPrinter {
  Backend &B;
  bool EnableVerboseOutput;

  std::unique_ptr<llvm::MCInstPrinter> MCIP;
  // Null if the output file could not be opened; see isFileValid().
  std::unique_ptr<llvm::ToolOutputFile> File;

  // Optional views. RPV and TV are created on demand by the add*View()
  // methods; BS is created by initialize() in verbose mode only.
  std::unique_ptr<ResourcePressureView> RPV;
  std::unique_ptr<TimelineView> TV;
  std::unique_ptr<BackendStatistics> BS;

  // Helpers used by printReport(), one per section of the report.
  void printDispatchStalls(unsigned RATStalls, unsigned RCUStalls,
                           unsigned SCHEDQStalls, unsigned LDQStalls,
                           unsigned STQStalls, unsigned DGStalls) const;
  void printRATStatistics(unsigned Mappings, unsigned MaxUsedMappings) const;
  void printSchedulerUsage(const llvm::MCSchedModel &SM,
                           const llvm::ArrayRef<BufferUsageEntry> &Usage) const;
  void printGeneralStatistics(unsigned Iterations, unsigned Cycles,
                              unsigned Instructions,
                              unsigned DispatchWidth) const;
  void printInstructionInfo() const;

  std::unique_ptr<llvm::ToolOutputFile> getOutputStream(std::string OutputFile);
  void initialize(std::string OutputFileName);

public:
  /// Creates a printer for \p backend that writes to the file named
  /// \p OutputFileName and pretty prints instructions through \p IP.
  /// Callers should check isFileValid() before requesting a report.
  BackendPrinter(Backend &backend, std::string OutputFileName,
                 std::unique_ptr<llvm::MCInstPrinter> IP, bool EnableVerbose)
      : B(backend), EnableVerboseOutput(EnableVerbose), MCIP(std::move(IP)) {
    initialize(OutputFileName);
  }

  /// Marks the output file (if any) as one that must be kept.
  ~BackendPrinter() {
    if (File)
      File->keep();
  }

  /// Returns true if the output file was opened successfully.
  bool isFileValid() const { return File.get(); }

  /// Returns the stream of the output file. The file must be valid.
  llvm::raw_ostream &getOStream() const {
    assert(isFileValid());
    return File->os();
  }

  llvm::MCInstPrinter &getMCInstPrinter() const { return *MCIP; }

  void addResourcePressureView();
  void addTimelineView(unsigned MaxIterations = 3, unsigned MaxCycles = 80);

  /// Prints the report to the output file. The file must be valid.
  void printReport() const;
};

} // namespace mca

#endif
79 changes: 79 additions & 0 deletions llvm/tools/llvm-mca/BackendStatistics.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
//===--------------------- BackendStatistics.cpp ---------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
/// \file
///
/// Functionalities used by the BackendPrinter to print out histograms
/// related to number of {dispatch/issue/retire} per number of cycles.
///
//===----------------------------------------------------------------------===//

#include "BackendStatistics.h"
#include "llvm/Support/Format.h"

using namespace llvm;

namespace mca {

/// Prints the "Retire Control Unit" histogram to \p OS: one line per entry of
/// RetiredPerCycle, showing the number of retired instructions, the number of
/// cycles, and that number of cycles as a percentage of NumCycles.
void BackendStatistics::printRetireUnitStatistics(llvm::raw_ostream &OS) const {
// The section is formatted into a temporary string and written out at the end.
std::string Buffer;
raw_string_ostream TempStream(Buffer);
TempStream << "\n\nRetire Control Unit - "
<< "number of cycles where we saw N instructions retired:\n";
TempStream << "[# retired], [# cycles]\n";

for (const std::pair<unsigned, unsigned> &Entry : RetiredPerCycle) {
TempStream << " " << Entry.first;
// NOTE(review): both branches below append the same literal, so the `< 10`
// test has no visible effect here - confirm whether the two branches were
// meant to differ in padding.
if (Entry.first < 10)
TempStream << ", ";
else
TempStream << ", ";
TempStream << Entry.second << " ("
<< format("%.1f", ((double)Entry.second / NumCycles) * 100.0)
<< "%)\n";
}

TempStream.flush();
OS << Buffer;
}

/// Prints the "Dispatch Logic" histogram to \p OS: one line per entry of
/// DispatchGroupSizePerCycle, showing the number of dispatched instructions,
/// the number of cycles, and that number of cycles as a percentage of
/// NumCycles.
void BackendStatistics::printDispatchUnitStatistics(llvm::raw_ostream &OS) const {
  std::string Text;
  raw_string_ostream SS(Text);
  SS << "\n\nDispatch Logic - "
     << "number of cycles where we saw N instructions dispatched:\n"
     << "[# dispatched], [# cycles]\n";

  for (const auto &Entry : DispatchGroupSizePerCycle) {
    const double Percentage = ((double)Entry.second / NumCycles) * 100.0;
    SS << " " << Entry.first << ", " << Entry.second << " ("
       << format("%.1f", Percentage) << "%)\n";
  }

  OS << SS.str();
}

/// Writes the issue-stage histogram to \p OS.
///
/// Every row reports a number of instructions issued within one cycle, the
/// number of cycles in which that happened, and the share of NumCycles those
/// cycles account for.
void BackendStatistics::printSchedulerStatistics(llvm::raw_ostream &OS) const {
  std::string Report;
  raw_string_ostream Stream(Report);
  Stream << "\n\nSchedulers - number of cycles where we saw N instructions "
            "issued:\n"
         << "[# issued], [# cycles]\n";

  for (const auto &Bucket : IssuedPerCycle) {
    const unsigned IssuedCount = Bucket.first;
    const unsigned CycleCount = Bucket.second;
    const double Percentage = ((double)CycleCount / NumCycles) * 100;
    Stream << " " << IssuedCount << ", " << CycleCount << " ("
           << format("%.1f", Percentage) << "%)\n";
  }

  // Push everything into the backing string before handing it over.
  Stream.flush();
  OS << Report;
}

} // namespace mca

95 changes: 95 additions & 0 deletions llvm/tools/llvm-mca/BackendStatistics.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
//===--------------------- BackendStatistics.h ------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
/// \file
///
/// This file implements a printer class for printing generic Backend
/// statistics related to the dispatch logic, scheduler and retire unit.
///
/// Example:
/// ========
///
/// Dispatch Logic - number of cycles where we saw N instructions dispatched:
/// [# dispatched], [# cycles]
/// 0, 15 (11.5%)
/// 5, 4 (3.1%)
///
/// Schedulers - number of cycles where we saw N instructions issued:
/// [# issued], [# cycles]
/// 0, 7 (5.4%)
/// 1, 4 (3.1%)
/// 2, 8 (6.2%)
///
/// Retire Control Unit - number of cycles where we saw N instructions retired:
/// [# retired], [# cycles]
/// 0, 9 (6.9%)
/// 1, 6 (4.6%)
/// 2, 1 (0.8%)
/// 4, 3 (2.3%)
///
//===----------------------------------------------------------------------===//

#ifndef LLVM_TOOLS_LLVM_MCA_BACKENDSTATISTICS_H
#define LLVM_TOOLS_LLVM_MCA_BACKENDSTATISTICS_H

#include "HWEventListener.h"
#include "llvm/Support/raw_ostream.h"
#include <map>

namespace mca {

/// \brief Event listener that builds per-cycle histograms for the dispatch
/// logic, the schedulers and the retire unit.
///
/// The listener counts dispatch/issue/retire events while a cycle is in
/// progress. At the end of every cycle the three counts are folded into the
/// corresponding histogram and reset. printHistograms() reports, for every
/// observed count, the number of cycles it was seen in and the percentage of
/// all cycles that represents.
class BackendStatistics : public HWEventListener {
  // Maps "number of events observed within one cycle" to "number of cycles in
  // which exactly that many events were observed".
  using Histogram = std::map<unsigned, unsigned>;
  Histogram DispatchGroupSizePerCycle;
  Histogram RetiredPerCycle;
  Histogram IssuedPerCycle;

  // Events counted for the cycle in progress; cleared by updateHistograms().
  unsigned NumDispatched;
  unsigned NumIssued;
  unsigned NumRetired;
  // Number of cycles started so far (bumped by onCycleBegin). It is the
  // denominator of every percentage printed by the print*Statistics methods,
  // so it must start from a known value.
  unsigned NumCycles;

  // Folds the counters of the cycle that just ended into the histograms and
  // resets them for the next cycle.
  void updateHistograms() {
    DispatchGroupSizePerCycle[NumDispatched]++;
    IssuedPerCycle[NumIssued]++;
    RetiredPerCycle[NumRetired]++;
    NumDispatched = 0;
    NumIssued = 0;
    NumRetired = 0;
  }

  void printRetireUnitStatistics(llvm::raw_ostream &OS) const;
  void printDispatchUnitStatistics(llvm::raw_ostream &OS) const;
  void printSchedulerStatistics(llvm::raw_ostream &OS) const;

public:
  // All counters start from zero. NumCycles is included: it is incremented and
  // read later, so leaving it uninitialized would make every reported
  // percentage depend on an indeterminate value.
  BackendStatistics()
      : NumDispatched(0), NumIssued(0), NumRetired(0), NumCycles(0) {}

  void onInstructionDispatched(unsigned Index) override { NumDispatched++; }
  void
  onInstructionIssued(unsigned Index,
                      const llvm::ArrayRef<std::pair<ResourceRef, unsigned>>
                          & /* unused */) override {
    NumIssued++;
  }
  void onInstructionRetired(unsigned Index) override { NumRetired++; }

  void onCycleBegin(unsigned Cycle) override { NumCycles++; }

  void onCycleEnd(unsigned Cycle) override { updateHistograms(); }

  // Prints the dispatch, scheduler and retire histograms to OS, in that order.
  void printHistograms(llvm::raw_ostream &OS) {
    printDispatchUnitStatistics(OS);
    printSchedulerStatistics(OS);
    printRetireUnitStatistics(OS);
  }
};

} // namespace mca

#endif
25 changes: 25 additions & 0 deletions llvm/tools/llvm-mca/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# LLVM components requested for linking: every configured target's asm
# printers, asm parsers, descriptions, disassemblers and infos, plus the MC,
# MCParser and Support libraries.
set(LLVM_LINK_COMPONENTS
AllTargetsAsmPrinters
AllTargetsAsmParsers
AllTargetsDescs
AllTargetsDisassemblers
AllTargetsInfos
MC
MCParser
Support
)

# The llvm-mca tool and the source files it is built from.
add_llvm_tool(llvm-mca
Backend.cpp
BackendPrinter.cpp
BackendStatistics.cpp
Dispatch.cpp
HWEventListener.cpp
InstrBuilder.cpp
Instruction.cpp
LSUnit.cpp
llvm-mca.cpp
ResourcePressureView.cpp
Scheduler.cpp
TimelineView.cpp
)
268 changes: 268 additions & 0 deletions llvm/tools/llvm-mca/Dispatch.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,268 @@
//===--------------------- Dispatch.cpp -------------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
/// \file
///
/// This file implements methods declared by class RegisterFile, DispatchUnit
/// and RetireControlUnit.
///
//===----------------------------------------------------------------------===//

#include "Dispatch.h"
#include "Backend.h"
#include "Scheduler.h"
#include "llvm/Support/Debug.h"

using namespace llvm;

#define DEBUG_TYPE "llvm-mca"

namespace mca {

// Records WS as the latest write to the register it defines and to all of its
// sub-registers. When WS fully updates super-registers, those are remapped to
// WS as well. The mapping statistics are updated along the way.
void RegisterFile::addRegisterMapping(WriteState &WS) {
  const unsigned RegID = WS.getRegisterID();
  assert(RegID && "Adding an invalid register definition?");

  RegisterMappings[RegID] = &WS;
  for (MCSubRegIterator SubReg(RegID, &MRI); SubReg.isValid(); ++SubReg)
    RegisterMappings[*SubReg] = &WS;

  // The high-water mark only moves when the number of mappings in use was
  // already sitting on it before this new mapping is accounted for.
  const bool WasAtHighWaterMark = MaxUsedMappings == NumUsedMappings;
  ++NumUsedMappings;
  if (WasAtHighWaterMark)
    ++MaxUsedMappings;
  ++TotalMappingsCreated;

  // A partial update leaves the super-registers mapped to whatever wrote them
  // last; only a full update takes them over.
  if (WS.fullyUpdatesSuperRegs()) {
    for (MCSuperRegIterator SuperReg(RegID, &MRI); SuperReg.isValid();
         ++SuperReg)
      RegisterMappings[*SuperReg] = &WS;
  }
}

void RegisterFile::invalidateRegisterMapping(const WriteState &WS) {
unsigned RegID = WS.getRegisterID();
bool ShouldInvalidateSuperRegs = WS.fullyUpdatesSuperRegs();

assert(RegID != 0 && "Invalidating an already invalid register?");
assert(WS.getCyclesLeft() != -512 &&
"Invalidating a write of unknown cycles!");
assert(WS.getCyclesLeft() <= 0 && "Invalid cycles left for this write!");
if (!RegisterMappings[RegID])
return;

assert(NumUsedMappings);
NumUsedMappings--;

if (RegisterMappings[RegID] == &WS)
RegisterMappings[RegID] = nullptr;

for (MCSubRegIterator I(RegID, &MRI); I.isValid(); ++I)
if (RegisterMappings[*I] == &WS)
RegisterMappings[*I] = nullptr;

if (!ShouldInvalidateSuperRegs)
return;

for (MCSuperRegIterator I(RegID, &MRI); I.isValid(); ++I)
if (RegisterMappings[*I] == &WS)
RegisterMappings[*I] = nullptr;
}

// Called in the event of instruction retired to update the number of used
// mappings. This method delegates to the register file the task of
// invalidating the register mappings that were created for the definitions of
// instruction IS.
void DispatchUnit::invalidateRegisterMappings(const Instruction &IS) {
  for (const auto &Def : IS.getDefs()) {
    DEBUG(dbgs() << "[RAT] Invalidating mapping for: ");
    DEBUG(Def->dump());
    RAT->invalidateRegisterMapping(*Def);
  }
}

// Appends to Writes every tracked write that a read of RegID depends on: the
// write mapped to RegID itself (if any), followed by the writes mapped to its
// sub-registers. A write already present in Writes is not appended again.
void RegisterFile::collectWrites(SmallVectorImpl<WriteState *> &Writes,
                                 unsigned RegID) const {
  assert(RegID && RegID < RegisterMappings.size());
  if (WriteState *FullWrite = RegisterMappings[RegID]) {
    DEBUG(dbgs() << "Found a dependent use of RegID=" << RegID << '\n');
    Writes.push_back(FullWrite);
  }

  // Handle potential partial register updates.
  for (MCSubRegIterator SubReg(RegID, &MRI); SubReg.isValid(); ++SubReg) {
    WriteState *PartialWrite = RegisterMappings[*SubReg];
    if (!PartialWrite)
      continue;
    // The same write is usually mapped to several sub-registers.
    if (std::find(Writes.begin(), Writes.end(), PartialWrite) != Writes.end())
      continue;

    DEBUG(dbgs() << "Found a dependent use of subReg " << *SubReg
                 << " (part of " << RegID << ")\n");
    Writes.push_back(PartialWrite);
  }
}

// Returns true if the register file can provide NumRegWrites more mappings.
//
// A TotalMappings of zero means "no limit", so the answer is always true. If
// a single request is larger than the configured limit, a warning is printed
// and the limit is raised to the size of the request.
bool RegisterFile::isAvailable(unsigned NumRegWrites) {
  if (TotalMappings == 0)
    return true;

  if (TotalMappings < NumRegWrites) {
    // The user specified a too small number of registers.
    // Artificially set the number of temporaries to NumRegWrites.
    errs() << "warning: not enough temporaries in the register file. "
           << "The register file size has been automatically increased to "
           << NumRegWrites << '\n';
    TotalMappings = NumRegWrites;
  }

  const unsigned MappingsNeeded = NumRegWrites + NumUsedMappings;
  return MappingsNeeded <= TotalMappings;
}

#ifndef NDEBUG
void RegisterFile::dump() const {
for (unsigned I = 0, E = MRI.getNumRegs(); I < E; ++I)
if (RegisterMappings[I]) {
dbgs() << MRI.getName(I) << ", " << I << ", ";
RegisterMappings[I]->dump();
}

dbgs() << "TotalMappingsCreated: " << TotalMappingsCreated
<< ", MaxUsedMappings: " << MaxUsedMappings
<< ", NumUsedMappings: " << NumUsedMappings << '\n';
}
#endif

// Reserves a number of slots, and returns a new token.
//
// Index is the index of the instruction being dispatched and NumMicroOps the
// number of micro opcodes it declares. The number of slots actually reserved
// is NumMicroOps clamped to the range [1, Queue.size()]. The returned token is
// the index of the first reserved slot in the retire queue.
unsigned RetireControlUnit::reserveSlot(unsigned Index, unsigned NumMicroOps) {
  unsigned NormalizedQuantity =
      std::min(NumMicroOps, static_cast<unsigned>(Queue.size()));
  // Zero latency instructions may have zero mOps. Artificially bump this
  // value to 1. Although zero latency instructions don't consume scheduler
  // resources, they still consume one slot in the retire queue.
  NormalizedQuantity = std::max(NormalizedQuantity, 1U);

  // Check availability against the quantity that is really consumed. Checking
  // the raw NumMicroOps would let a zero-uop instruction through on a full
  // queue, and the subtraction below would then wrap AvailableSlots around.
  assert(AvailableSlots >= NormalizedQuantity &&
         "Not enough slots available in the retire queue!");

  unsigned TokenID = NextAvailableSlotIdx;
  Queue[NextAvailableSlotIdx] = {Index, NormalizedQuantity, false};
  NextAvailableSlotIdx += NormalizedQuantity;
  NextAvailableSlotIdx %= Queue.size();
  AvailableSlots -= NormalizedQuantity;
  return TokenID;
}

// Forwards the "instruction dispatched" event for instruction Index to the
// Backend that owns this DispatchUnit.
void DispatchUnit::notifyInstructionDispatched(unsigned Index) {
Owner->notifyInstructionDispatched(Index);
}

// Forwards the "instruction retired" event for instruction Index to the
// Backend that owns this DispatchUnit.
void DispatchUnit::notifyInstructionRetired(unsigned Index) {
Owner->notifyInstructionRetired(Index);
}

void RetireControlUnit::cycleEvent() {
if (isEmpty())
return;

unsigned NumRetired = 0;
while (!isEmpty()) {
if (MaxRetirePerCycle != 0 && NumRetired == MaxRetirePerCycle)
break;
RUToken &Current = Queue[CurrentInstructionSlotIdx];
assert(Current.NumSlots && "Reserved zero slots?");
if (!Current.Executed)
break;
Owner->notifyInstructionRetired(Current.Index);
CurrentInstructionSlotIdx += Current.NumSlots;
CurrentInstructionSlotIdx %= Queue.size();
AvailableSlots += Current.NumSlots;
NumRetired++;
}
}

void RetireControlUnit::onInstructionExecuted(unsigned TokenID) {
assert(Queue.size() > TokenID);
assert(Queue[TokenID].Executed == false && Queue[TokenID].Index != ~0U);
Queue[TokenID].Executed = true;
}

#ifndef NDEBUG
// Debug-only helper: prints the total number of slots in the retire queue and
// the number of slots that are currently available.
void RetireControlUnit::dump() const {
dbgs() << "Retire Unit: { Total Slots=" << Queue.size()
<< ", Available Slots=" << AvailableSlots << " }\n";
}
#endif

// Returns true if the register file can rename all the register writes
// declared by Desc. Otherwise a RAT dispatch stall is recorded and false is
// returned.
bool DispatchUnit::checkRAT(const InstrDesc &Desc) {
  const unsigned RegisterWrites = Desc.Writes.size();
  if (!RAT->isAvailable(RegisterWrites)) {
    ++DispatchStalls[DS_RAT_REG_UNAVAILABLE];
    return false;
  }
  return true;
}

// Returns true if the retire control unit has room for the micro opcodes
// declared by Desc. Otherwise an RCU dispatch stall is recorded and false is
// returned.
bool DispatchUnit::checkRCU(const InstrDesc &Desc) {
  const unsigned MicroOps = Desc.NumMicroOps;
  if (!RCU->isAvailable(MicroOps)) {
    ++DispatchStalls[DS_RCU_TOKEN_UNAVAILABLE];
    return false;
  }
  return true;
}

// Asks the scheduler whether an instruction described by Desc can be
// dispatched. Returns true only when the scheduler reports HWS_AVAILABLE;
// every other outcome known to this method bumps the matching dispatch stall
// counter and yields false.
bool DispatchUnit::checkScheduler(const InstrDesc &Desc) {
  switch (SC->canBeDispatched(Desc)) {
  case Scheduler::HWS_AVAILABLE:
    return true;
  case Scheduler::HWS_QUEUE_UNAVAILABLE:
    ++DispatchStalls[DS_SQ_TOKEN_UNAVAILABLE];
    return false;
  case Scheduler::HWS_LD_QUEUE_UNAVAILABLE:
    ++DispatchStalls[DS_LDQ_TOKEN_UNAVAILABLE];
    return false;
  case Scheduler::HWS_ST_QUEUE_UNAVAILABLE:
    ++DispatchStalls[DS_STQ_TOKEN_UNAVAILABLE];
    return false;
  case Scheduler::HWS_DISPATCH_GROUP_RESTRICTION:
    ++DispatchStalls[DS_DISPATCH_GROUP_RESTRICTION];
    return false;
  }

  // Any outcome not listed above is treated as "cannot dispatch" without
  // recording a stall.
  return false;
}

// Dispatches instruction NewInst (identified by IID) and returns the token
// reserved for it in the retire control unit.
//
// The dispatch budget of the current cycle is charged with the micro opcodes
// of the instruction. An instruction wider than DispatchWidth consumes the
// whole budget and leaves the remainder in CarryOver, to be paid off by the
// following cycles.
unsigned DispatchUnit::dispatch(unsigned IID, Instruction *NewInst) {
  assert(!CarryOver && "Cannot dispatch another instruction!");
  const unsigned NumMicroOps = NewInst->getDesc().NumMicroOps;
  if (NumMicroOps <= DispatchWidth) {
    assert(AvailableEntries >= NumMicroOps);
    AvailableEntries -= NumMicroOps;
  } else {
    assert(AvailableEntries == DispatchWidth);
    CarryOver = NumMicroOps - DispatchWidth;
    AvailableEntries = 0;
  }

  // Reserve slots in the RCU, notify the owner, and only then hand the
  // instruction over to the scheduler.
  const unsigned RCUTokenID = RCU->reserveSlot(IID, NumMicroOps);
  Owner->notifyInstructionDispatched(IID);
  SC->scheduleInstruction(IID, NewInst);
  return RCUTokenID;
}

#ifndef NDEBUG
// Debug-only helper: dumps the register file and the retire control unit,
// then prints the dispatch stall counters.
// NOTE(review): the DS_DISPATCH_GROUP_RESTRICTION counter is not part of this
// report.
void DispatchUnit::dump() const {
  RAT->dump();
  RCU->dump();

  dbgs() << "STALLS --- RAT: " << DispatchStalls[DS_RAT_REG_UNAVAILABLE]
         << ", RCU: " << DispatchStalls[DS_RCU_TOKEN_UNAVAILABLE]
         << ", SCHED_QUEUE: " << DispatchStalls[DS_SQ_TOKEN_UNAVAILABLE]
         << ", LOAD_QUEUE: " << DispatchStalls[DS_LDQ_TOKEN_UNAVAILABLE]
         << ", STORE_QUEUE: " << DispatchStalls[DS_STQ_TOKEN_UNAVAILABLE]
         << '\n';
}
#endif

} // namespace mca
319 changes: 319 additions & 0 deletions llvm/tools/llvm-mca/Dispatch.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,319 @@
//===----------------------- Dispatch.h -------------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
/// \file
///
/// This file implements classes that are used to model register files,
/// reorder buffers and the hardware dispatch logic.
///
//===----------------------------------------------------------------------===//

#ifndef LLVM_TOOLS_LLVM_MCA_DISPATCH_H
#define LLVM_TOOLS_LLVM_MCA_DISPATCH_H

#include "Instruction.h"
#include "llvm/MC/MCRegisterInfo.h"
#include <map>

namespace mca {

class WriteState;
class DispatchUnit;
class Scheduler;
class Backend;

/// \brief Keeps track of register definitions.
///
/// This class tracks register definitions, and performs register renaming
/// to break anti dependencies.
/// By default, there is no limit in the number of register aliases which
/// can be created for the purpose of register renaming. However, users can
/// specify at object construction time a limit in the number of temporary
/// registers which can be used by the register renaming logic.
class RegisterFile {
const llvm::MCRegisterInfo &MRI;
// Currently used mappings and maximum used mappings.
// These are to generate statistics only.
unsigned NumUsedMappings;
unsigned MaxUsedMappings;
// Total number of mappings created over time.
unsigned TotalMappingsCreated;

// The maximum number of register aliases which can be used by the
// register renamer. Default value for this field is zero.
// A value of zero for this field means that there is no limit in the
// amount of register mappings which can be created. That is equivalent
// to having a theoretically infinite number of temporary registers.
unsigned TotalMappings;

// This map contains an entry for every physical register.
// A register index is used as a key value to access a WriteState.
// This is how we track RAW dependencies for dispatched
// instructions. For every register, we track the last seen write only.
// This assumes that all writes fully update both super and sub registers.
// We need a flag in MCInstrDesc to check if a write also updates super
// registers. We can then have an extra tablegen flag to set for instructions.
// This is a separate patch on its own.
std::vector<WriteState *> RegisterMappings;
// Assumptions are:
// a) a false dependency is always removed by the register renamer.
// b) the register renamer can create an "infinite" number of mappings.
// Since we track the number of mappings created, in future we may
// introduce constraints on the number of mappings that can be created.
// For example, the maximum number of registers that are available for
// register renaming purposes may default to the size of the register file.

// In future, we can extend this design to allow multiple register files, and
// apply different restrictions on the register mappings and the number of
// temporary registers used by mappings.

public:
// Builds a register file with one (initially empty) mapping slot for every
// register known to mri. Mappings is the limit on the number of mappings;
// zero means no limit. Note that RegisterMappings is sized through the MRI
// member, so MRI must stay declared before it.
RegisterFile(const llvm::MCRegisterInfo &mri, unsigned Mappings = 0)
: MRI(mri), NumUsedMappings(0), MaxUsedMappings(0),
TotalMappingsCreated(0), TotalMappings(Mappings),
RegisterMappings(MRI.getNumRegs(), nullptr) {}

// Creates a new register mapping for RegID.
// This reserves a temporary register in the register file.
void addRegisterMapping(WriteState &WS);

// Invalidates register mappings associated to the input WriteState object.
// This releases temporary registers in the register file.
void invalidateRegisterMapping(const WriteState &WS);

// Returns true if NumRegWrites more mappings can be created. This is not
// const: a request larger than the configured limit raises the limit.
bool isAvailable(unsigned NumRegWrites);
// Appends to Writes the tracked writes that a read of RegID depends on.
void collectWrites(llvm::SmallVectorImpl<WriteState *> &Writes,
unsigned RegID) const;
void updateOnRead(ReadState &RS, unsigned RegID);
unsigned getMaxUsedRegisterMappings() const { return MaxUsedMappings; }
unsigned getTotalRegisterMappingsCreated() const {
return TotalMappingsCreated;
}

#ifndef NDEBUG
void dump() const;
#endif
};

/// \brief tracks which instructions are in-flight (i.e. dispatched but not
/// retired) in the OoO backend.
///
/// This class checks on every cycle if/which instructions can be retired.
/// Instructions are retired in program order.
/// In the event of instruction retired, the DispatchUnit object that owns
/// this RetireControlUnit gets notified.
/// On instruction retired, register updates are all architecturally
/// committed, and any temporary registers originally allocated for the
/// retired instruction are freed.
struct RetireControlUnit {
  // A "token" (object of class RUToken) is created by the retire unit for every
  // instruction dispatched to the schedulers. Flag 'Executed' is used to
  // quickly check if an instruction has reached the write-back stage. A token
  // also carries information related to the number of entries consumed by the
  // instruction in the reorder buffer. The idea is that those entries will
  // become available again once the instruction is retired. On every cycle,
  // the RCU (Retire Control Unit) scans the tokens, starting from the oldest
  // one, to search for instructions that are ready to retire. Instructions are
  // retired in program order. Only 'Executed' instructions are eligible for
  // retire. Note that the size of the reorder buffer is defined by the
  // scheduling model via field 'NumMicroOpBufferSize'.
  struct RUToken {
    unsigned Index;    // Instruction index.
    unsigned NumSlots; // Slots reserved to this instruction.
    bool Executed;     // True if the instruction is past the WB stage.
  };

private:
  unsigned NextAvailableSlotIdx;      // Where the next token is stored.
  unsigned CurrentInstructionSlotIdx; // Token of the oldest in-flight instr.
  unsigned AvailableSlots;
  unsigned MaxRetirePerCycle; // 0 means no limit.
  std::vector<RUToken> Queue;
  DispatchUnit *Owner;

public:
  // NumSlots is the size of the retire queue (must not be zero), RPC the
  // maximum number of instructions retired per cycle (0 means no limit), and
  // DU the dispatch unit notified on every retired instruction.
  RetireControlUnit(unsigned NumSlots, unsigned RPC, DispatchUnit *DU)
      : NextAvailableSlotIdx(0), CurrentInstructionSlotIdx(0),
        AvailableSlots(NumSlots), MaxRetirePerCycle(RPC), Owner(DU) {
    assert(NumSlots && "Expected at least one slot!");
    Queue.resize(NumSlots);
  }

  bool isFull() const { return !AvailableSlots; }
  bool isEmpty() const { return AvailableSlots == Queue.size(); }

  // Returns true if Quantity slots can be reserved right now.
  bool isAvailable(unsigned Quantity = 1) const {
    // Some instructions may declare a number of uOps which exceeds the size
    // of the reorder buffer. To avoid problems, cap the amount of slots to
    // the size of the reorder buffer.
    Quantity = std::min(Quantity, static_cast<unsigned>(Queue.size()));
    // An instruction that declares zero uOps still consumes one slot when it
    // is reserved (see reserveSlot). Apply the same normalization here;
    // otherwise such an instruction would be reported as dispatchable on a
    // full queue and the reservation would underflow AvailableSlots.
    Quantity = std::max(Quantity, 1U);
    return AvailableSlots >= Quantity;
  }

  // Reserves a number of slots, and returns a new token.
  unsigned reserveSlot(unsigned Index, unsigned NumMicroOps);

  /// Retires instructions in program order.
  void cycleEvent();

  // Flags the token identified by TokenID as executed.
  void onInstructionExecuted(unsigned TokenID);

#ifndef NDEBUG
  void dump() const;
#endif
};

// \brief Implements the hardware dispatch logic.
//
// This class is responsible for the dispatch stage, in which instructions are
// dispatched in groups to the Scheduler. An instruction can be dispatched if
// functional units are available.
// To be more specific, an instruction can be dispatched to the Scheduler if:
// 1) There are enough entries in the reorder buffer (implemented by class
// RetireControlUnit) to accommodate all opcodes.
// 2) There are enough temporaries to rename output register operands.
// 3) There are enough entries available in the used buffered resource(s).
//
// The number of micro opcodes that can be dispatched in one cycle is limited by
// the value of field 'DispatchWidth'. A "dynamic dispatch stall" occurs when
// processor resources are not available (i.e. at least one of the
// above-mentioned checks fails). Dispatch stall events are counted during the
// entire execution of the code, and displayed by the performance report when
// flag '-verbose' is specified.
//
// If the number of micro opcodes of an instruction is bigger than
// DispatchWidth, then it can only be dispatched at the beginning of one cycle.
// The DispatchUnit will still have to wait for a number of cycles (depending on
// the DispatchWidth and the number of micro opcodes) before it can serve other
// instructions.
class DispatchUnit {
// Maximum number of micro opcodes dispatched per cycle.
unsigned DispatchWidth;
// What is left of the dispatch budget in the current cycle.
unsigned AvailableEntries;
// Micro opcodes of an instruction wider than DispatchWidth that still have to
// be charged to the following cycles.
unsigned CarryOver;
Scheduler *SC;

std::unique_ptr<RegisterFile> RAT;
std::unique_ptr<RetireControlUnit> RCU;
Backend *Owner;

/// Dispatch stall event identifiers.
///
/// The naming convention is:
/// * Event names start with the "DS_" prefix
/// * For dynamic dispatch stalls, the "DS_" prefix is followed by the
/// unavailable resource/functional unit acronym (example: RAT)
/// * The last substring is the event reason (example: REG_UNAVAILABLE means
/// that register renaming couldn't find enough spare registers in the
/// register file).
///
/// List of acronyms used for processor resources:
/// RAT - Register Alias Table (used by the register renaming logic)
/// RCU - Retire Control Unit
/// SQ - Scheduler's Queue
/// LDQ - Load Queue
/// STQ - Store Queue
enum {
DS_RAT_REG_UNAVAILABLE,
DS_RCU_TOKEN_UNAVAILABLE,
DS_SQ_TOKEN_UNAVAILABLE,
DS_LDQ_TOKEN_UNAVAILABLE,
DS_STQ_TOKEN_UNAVAILABLE,
DS_DISPATCH_GROUP_RESTRICTION,
DS_LAST
};

// The DispatchUnit tracks dispatch stall events caused by the unavailability
// of hardware resources. Events are classified based on the stall kind;
// so we have a counter for every source of dispatch stall. Counters are
// stored into a vector `DispatchStalls` which is always of size DS_LAST.
std::vector<unsigned> DispatchStalls;

// Each check returns true if the corresponding resource is available for an
// instruction described by Desc, and records a dispatch stall otherwise.
bool checkRAT(const InstrDesc &Desc);
bool checkRCU(const InstrDesc &Desc);
bool checkScheduler(const InstrDesc &Desc);

void notifyInstructionDispatched(unsigned IID);

public:
// Builds the dispatch logic together with the register file and the retire
// control unit it owns. MicroOpBufferSize is the number of slots of the retire
// control unit, RegisterFileSize the limit on register mappings, and
// MaxDispatchWidth the dispatch budget of every cycle.
DispatchUnit(Backend *B, const llvm::MCRegisterInfo &MRI,
unsigned MicroOpBufferSize, unsigned RegisterFileSize,
unsigned MaxRetirePerCycle, unsigned MaxDispatchWidth,
Scheduler *Sched)
: DispatchWidth(MaxDispatchWidth), AvailableEntries(MaxDispatchWidth),
CarryOver(0U), SC(Sched),
RAT(llvm::make_unique<RegisterFile>(MRI, RegisterFileSize)),
RCU(llvm::make_unique<RetireControlUnit>(MicroOpBufferSize,
MaxRetirePerCycle, this)),
Owner(B), DispatchStalls(DS_LAST, 0) {}

unsigned getDispatchWidth() const { return DispatchWidth; }

// Returns true if NumEntries fit in what is left of the dispatch budget of the
// current cycle, or if the whole budget is still untouched (which is how an
// instruction wider than DispatchWidth gets a chance to be dispatched).
bool isAvailable(unsigned NumEntries) const {
return NumEntries <= AvailableEntries || AvailableEntries == DispatchWidth;
}

bool isRCUEmpty() const { return RCU->isEmpty(); }

// Returns true if an instruction described by Desc can be dispatched now.
// The checks run in order (RCU, RAT, scheduler) and stop at the first failure,
// so at most one dispatch stall is recorded per call.
bool canDispatch(const InstrDesc &Desc) {
assert(isAvailable(Desc.NumMicroOps));
return checkRCU(Desc) && checkRAT(Desc) && checkScheduler(Desc);
}

// Dispatches NewInst and returns the token reserved for it in the RCU.
unsigned dispatch(unsigned IID, Instruction *NewInst);

void collectWrites(llvm::SmallVectorImpl<WriteState *> &Vec,
unsigned RegID) const {
return RAT->collectWrites(Vec, RegID);
}
// Accessors for the dispatch stall counters.
unsigned getNumRATStalls() const {
return DispatchStalls[DS_RAT_REG_UNAVAILABLE];
}
unsigned getNumRCUStalls() const {
return DispatchStalls[DS_RCU_TOKEN_UNAVAILABLE];
}
unsigned getNumSQStalls() const {
return DispatchStalls[DS_SQ_TOKEN_UNAVAILABLE];
}
unsigned getNumLDQStalls() const {
return DispatchStalls[DS_LDQ_TOKEN_UNAVAILABLE];
}
unsigned getNumSTQStalls() const {
return DispatchStalls[DS_STQ_TOKEN_UNAVAILABLE];
}
unsigned getNumDispatchGroupStalls() const {
return DispatchStalls[DS_DISPATCH_GROUP_RESTRICTION];
}
unsigned getMaxUsedRegisterMappings() const {
return RAT->getMaxUsedRegisterMappings();
}
unsigned getTotalRegisterMappingsCreated() const {
return RAT->getTotalRegisterMappingsCreated();
}
void addNewRegisterMapping(WriteState &WS) { RAT->addRegisterMapping(WS); }

// Starts a new cycle: lets the RCU retire instructions, then recomputes the
// dispatch budget. Micro opcodes still owed by a carried-over instruction are
// taken out of the new budget, and CarryOver shrinks by at most DispatchWidth.
void cycleEvent(unsigned Cycle) {
RCU->cycleEvent();
AvailableEntries =
CarryOver >= DispatchWidth ? 0 : DispatchWidth - CarryOver;
CarryOver = CarryOver >= DispatchWidth ? CarryOver - DispatchWidth : 0U;
}

void notifyInstructionRetired(unsigned Index);

void onInstructionExecuted(unsigned TokenID) {
RCU->onInstructionExecuted(TokenID);
}

void invalidateRegisterMappings(const Instruction &Inst);
#ifndef NDEBUG
void dump() const;
#endif
};

} // namespace mca

#endif
22 changes: 22 additions & 0 deletions llvm/tools/llvm-mca/HWEventListener.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
//===----------------------- HWEventListener.cpp ----------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
/// \file
///
/// This file defines a vtable anchor for struct HWEventListener.
///
//===----------------------------------------------------------------------===//

#include "HWEventListener.h"

namespace mca {

// Anchor the vtable here: anchor() is the one virtual method of
// HWEventListener that is defined out of line, in this file.
void HWEventListener::anchor() {}

} // namespace mca
50 changes: 50 additions & 0 deletions llvm/tools/llvm-mca/HWEventListener.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@

//===----------------------- HWEventListener.h ------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
/// \file
///
/// This file defines the main interface for hardware event listeners.
///
//===----------------------------------------------------------------------===//

#ifndef LLVM_TOOLS_LLVM_MCA_HWEVENTLISTENER_H
#define LLVM_TOOLS_LLVM_MCA_HWEVENTLISTENER_H

#include "llvm/ADT/ArrayRef.h"
#include <utility>

namespace mca {

/// Interface for objects that want to be notified of hardware events.
///
/// Every handler has an empty default implementation, so a listener only
/// overrides the events it is interested in. Instructions are identified by
/// the unsigned Index passed to the handlers.
struct HWEventListener {
  // Events generated by the Retire Control Unit.
  virtual void onInstructionRetired(unsigned Index) {}

  // Events generated by the Scheduler.
  // NOTE(review): the meaning of the two ResourceRef fields, and of the
  // unsigned paired with each ResourceRef in 'Used', is not visible from
  // this header - confirm against the Scheduler.
  using ResourceRef = std::pair<uint64_t, uint64_t>;
  virtual void
  onInstructionIssued(unsigned Index,
                      const llvm::ArrayRef<std::pair<ResourceRef, unsigned>> &Used) {}
  virtual void onInstructionExecuted(unsigned Index) {}
  virtual void onInstructionReady(unsigned Index) {}
  virtual void onResourceAvailable(const ResourceRef &RRef) {}

  // Events generated by the Dispatch logic.
  virtual void onInstructionDispatched(unsigned Index) {}

  // Generic events generated by the Backend.
  virtual void onCycleBegin(unsigned Cycle) {}
  virtual void onCycleEnd(unsigned Cycle) {}

  virtual ~HWEventListener() = default;
  // Defined out of line (HWEventListener.cpp) to anchor the vtable.
  virtual void anchor();
};

} // namespace mca

#endif
525 changes: 525 additions & 0 deletions llvm/tools/llvm-mca/InstrBuilder.cpp

Large diffs are not rendered by default.

62 changes: 62 additions & 0 deletions llvm/tools/llvm-mca/InstrBuilder.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
//===--------------------- InstrBuilder.h -----------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
/// \file
///
/// A builder class for instructions that are statically analyzed by llvm-mca.
///
//===----------------------------------------------------------------------===//

#ifndef LLVM_TOOLS_LLVM_MCA_INSTRBUILDER_H
#define LLVM_TOOLS_LLVM_MCA_INSTRBUILDER_H

#include "Dispatch.h"
#include "Instruction.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"

namespace mca {

class DispatchUnit;

/// \brief A builder class that knows how to construct Instruction objects.
///
/// Every llvm-mca Instruction is described by an object of class InstrDesc.
/// An InstrDesc describes which registers are read/written by the instruction,
/// as well as the instruction latency and hardware resources consumed.
///
/// This class is used by the tool to construct Instructions and instruction
/// descriptors (i.e. InstrDesc objects).
/// Information from the machine scheduling model is used to identify processor
/// resources that are consumed by an instruction.
class InstrBuilder {
const llvm::MCInstrInfo &MCII;
// Non-owning view of the processor resource masks passed to the constructor.
// NOTE(review): an ArrayRef does not own its elements, so the array handed to
// the constructor has to outlive this builder - confirm at the call site.
const llvm::ArrayRef<uint64_t> ProcResourceMasks;

// Instruction descriptors and instructions owned by this builder.
// NOTE(review): presumably Descriptors is keyed by opcode and Instructions by
// instruction index - confirm against InstrBuilder.cpp.
llvm::DenseMap<unsigned short, std::unique_ptr<const InstrDesc>> Descriptors;
llvm::DenseMap<unsigned, std::unique_ptr<Instruction>> Instructions;

void createInstrDescImpl(const llvm::MCSubtargetInfo &STI,
const llvm::MCInst &MCI);

public:
InstrBuilder(const llvm::MCInstrInfo &mcii,
const llvm::ArrayRef<uint64_t> Masks)
: MCII(mcii), ProcResourceMasks(Masks) {}

// Returns the descriptor for MCI, as the name says creating it on first use.
const InstrDesc &getOrCreateInstrDesc(const llvm::MCSubtargetInfo &STI,
const llvm::MCInst &MCI);

// Creates the Instruction object for MCI; Idx is the instruction index.
Instruction *createInstruction(const llvm::MCSubtargetInfo &STI,
DispatchUnit &DU, unsigned Idx,
const llvm::MCInst &MCI);
};

} // namespace mca

#endif
134 changes: 134 additions & 0 deletions llvm/tools/llvm-mca/Instruction.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
//===--------------------- Instruction.cpp ----------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines abstractions used by the Backend to model register reads,
// register writes and instructions.
//
//===----------------------------------------------------------------------===//

#include "Instruction.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

namespace mca {

using namespace llvm;

// Notifies this read that one of the writes it depends on has been issued and
// that its value will be available in Cycles cycles.
//
// This read may be dependent on more than one write. This typically occurs
// when a definition is the result of multiple writes where at least one
// write does a partial register update; the HW then has to keep track of all
// the dependent writes. The read becomes available only after the slowest of
// them, so the largest Cycles value seen so far is remembered and turned into
// CyclesLeft once the last dependent write has reported in.
void ReadState::writeStartEvent(unsigned Cycles) {
  assert(DependentWrites);
  assert(CyclesLeft == UNKNOWN_CYCLES);

  if (TotalCycles < Cycles)
    TotalCycles = Cycles;

  --DependentWrites;
  if (DependentWrites == 0)
    CyclesLeft = TotalCycles;
}

void WriteState::onInstructionIssued() {
assert(CyclesLeft == UNKNOWN_CYCLES);
// Update the number of cycles left based on the WriteDescriptor info.
CyclesLeft = WD.Latency;

// Now that the time left before write-back is know, notify
// all the users.
for (const std::pair<ReadState *, int> &User : Users) {
ReadState *RS = User.first;
unsigned ReadCycles = std::max(0, CyclesLeft - User.second);
RS->writeStartEvent(ReadCycles);
}
}

// Registers User as a reader of this write, with the given ReadAdvance.
//
// While CyclesLeft is still UNKNOWN_CYCLES the user is only recorded; it gets
// notified later, when the write is issued. Otherwise the number of cycles
// left is already known and the user is notified straight away with the
// actual value (which may be zero), without being added to the list of users.
void WriteState::addUser(ReadState *User, int ReadAdvance) {
  if (CyclesLeft == UNKNOWN_CYCLES) {
    Users.insert(std::pair<ReadState *, int>(User, ReadAdvance));
    return;
  }

  const unsigned ReadCycles = std::max(0, CyclesLeft - ReadAdvance);
  User->writeStartEvent(ReadCycles);
}

// Advances this write by one cycle. Nothing happens while the number of
// cycles left is still unknown.
void WriteState::cycleEvent() {
  if (CyclesLeft == UNKNOWN_CYCLES)
    return;

  // Note: CyclesLeft can be a negative number. It is an error to
  // make it an unsigned quantity because users of this write may
  // specify a negative ReadAdvance.
  --CyclesLeft;
}

// Advances this read by one cycle. The countdown only runs once the number of
// cycles left is known and no dependent write is outstanding, and it stops
// when it reaches zero.
void ReadState::cycleEvent() {
  const bool CountdownStarted =
      CyclesLeft != UNKNOWN_CYCLES && !DependentWrites;
  if (CountdownStarted && CyclesLeft != 0)
    --CyclesLeft;
}

#ifndef NDEBUG
// Debug-only helper: prints the operand index and latency taken from the
// write descriptor, followed by the register ID and the cycles left.
void WriteState::dump() const {
dbgs() << "{ OpIdx=" << WD.OpIndex << ", Lat=" << WD.Latency << ", RegID "
<< getRegisterID() << ", Cycles Left=" << getCyclesLeft() << " }\n";
}
#endif

bool Instruction::isReady() {
if (Stage == IS_READY)
return true;

assert(Stage == IS_AVAILABLE);
for (const UniqueUse &Use : Uses)
if (!Use.get()->isReady())
return false;

setReady();
return true;
}

void Instruction::execute() {
assert(Stage == IS_READY);
Stage = IS_EXECUTING;
for (UniqueDef &Def : Defs)
Def->onInstructionIssued();
}

// An instruction is "zero latency" when its descriptor reports a maximum
// latency of zero and it has no register definitions and no register uses.
bool Instruction::isZeroLatency() const {
  if (Desc.MaxLatency != 0)
    return false;
  return Defs.empty() && Uses.empty();
}

void Instruction::cycleEvent() {
if (isDispatched()) {
for (UniqueUse &Use : Uses)
Use->cycleEvent();
return;
}
if (isExecuting()) {
for (UniqueDef &Def : Defs)
Def->cycleEvent();
CyclesLeft--;
}
if (!CyclesLeft)
Stage = IS_EXECUTED;
}

} // namespace mca
336 changes: 336 additions & 0 deletions llvm/tools/llvm-mca/Instruction.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,336 @@
//===--------------------- Instruction.h ------------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
/// \file
///
/// This file defines abstractions used by the Backend to model register reads,
/// register writes and instructions.
///
//===----------------------------------------------------------------------===//

#ifndef LLVM_TOOLS_LLVM_MCA_INSTRUCTION_H
#define LLVM_TOOLS_LLVM_MCA_INSTRUCTION_H

#include "llvm/Support/MathExtras.h"
#include <memory>
#include <set>
#include <vector>

namespace mca {

struct WriteDescriptor;
struct ReadDescriptor;
class WriteState;
class ReadState;

// Sentinel stored in the CyclesLeft field of WriteState and ReadState objects
// for as long as the actual number of cycles is not known.
constexpr int UNKNOWN_CYCLES = -512;

/// \brief A register write descriptor.
struct WriteDescriptor {
  int OpIndex; // Operand index. -1 if this is an implicit write.
  // Write latency. Number of cycles before write-back stage.
  int Latency;
  // This field is set to a value different than zero only if this
  // is an implicit definition.
  unsigned RegisterID;
  // True if this write generates a partial update of a super-registers.
  // On X86, this flag is set by byte/word writes on GPR registers. Also,
  // a write of an XMM register only partially updates the corresponding
  // YMM super-register if the write is associated to a legacy SSE instruction.
  // NOTE(review): the field name suggests that 'true' means the
  // super-registers are *fully* updated, while the description above talks
  // about partial updates. Confirm the polarity against the code that sets
  // this flag.
  bool FullyUpdatesSuperRegs;
  // Instruction itineraries would set this field to the SchedClass ID.
  // Otherwise, it defaults to the WriteResourceID from the MCWriteLatencyEntry
  // element associated to this write.
  // When computing read latencies, this value is matched against the
  // "ReadAdvance" information. The hardware backend may implement
  // dedicated forwarding paths to quickly propagate write results to dependent
  // instructions waiting in the reservation station (effectively bypassing the
  // write-back stage).
  unsigned SClassOrWriteResourceID;
  // True only if this is a write obtained from an optional definition.
  // Optional definitions are allowed to reference regID zero (i.e. "no
  // register").
  bool IsOptionalDef;
};

/// \brief A register read descriptor.
struct ReadDescriptor {
  // Operand index. This field defaults to -1 if this is an implicit read.
  int OpIndex;
  // Register being read. This field is only set if this is an implicit read.
  unsigned RegisterID;
  // Scheduling Class Index. It is used to query the scheduling model for the
  // MCSchedClassDesc object.
  unsigned SchedClassID;
  // True if there may be a local forwarding logic in hardware to serve a
  // write used by this read. This information, along with SchedClassID, is
  // used to dynamically check at Instruction creation time, if the input
  // operands can benefit from a ReadAdvance bonus.
  bool HasReadAdvanceEntries;
};

/// \brief Tracks uses of a register definition (e.g. register write).
///
/// Each implicit/explicit register write is associated with an instance of
/// this class. A WriteState object tracks the dependent users of a
/// register write. It also tracks how many cycles are left before the write
/// back stage.
class WriteState {
  // Static description of this write (operand index, latency, ...).
  const WriteDescriptor &WD;

  // Number of cycles left before write-back. It holds UNKNOWN_CYCLES until
  // the instruction is issued; on issue it is set equal to the write latency.
  int CyclesLeft = UNKNOWN_CYCLES;

  // Actual register defined by this write. This field is only used to speed
  // up queries on the register file. For implicit writes it always matches
  // field RegisterID from WD.
  unsigned RegisterID;

  // The set of dependent reads. A read is added to the set only while
  // CyclesLeft is "unknown"; as soon as CyclesLeft becomes known, each user
  // in the set gets notified with the actual number of cycles left.
  // The 'second' element of each pair is a "ReadAdvance" number of cycles.
  std::set<std::pair<ReadState *, int>> Users;

public:
  WriteState(const WriteDescriptor &Desc)
      : WD(Desc), RegisterID(Desc.RegisterID) {}
  WriteState(const WriteState &Other) = delete;
  WriteState &operator=(const WriteState &Other) = delete;

  // Simple accessors.
  int getCyclesLeft() const { return CyclesLeft; }
  unsigned getWriteResourceID() const { return WD.SClassOrWriteResourceID; }
  unsigned getRegisterID() const { return RegisterID; }
  void setRegisterID(unsigned ID) { RegisterID = ID; }

  // Registers a dependent read, or notifies it immediately if the number of
  // cycles left is already known.
  void addUser(ReadState *Use, int ReadAdvance);
  bool fullyUpdatesSuperRegs() const { return WD.FullyUpdatesSuperRegs; }
  // The value is written back once the countdown reaches exactly zero.
  bool isWrittenBack() const { return !CyclesLeft; }

  // On every cycle, decrement CyclesLeft if it is known.
  void cycleEvent();
  // Starts the countdown and notifies the dependent users.
  void onInstructionIssued();

#ifndef NDEBUG
  void dump() const;
#endif
};

/// \brief Tracks register operand latency in cycles.
///
/// A read may be dependent on more than one write. This occurs when some
/// writes only partially update the register associated to this read.
class ReadState {
  // Static description of this read.
  const ReadDescriptor &RD;
  // Number of writes this read is still waiting to hear from.
  unsigned DependentWrites = 0;
  // Cycles left before the operand is available; UNKNOWN_CYCLES until set.
  int CyclesLeft = UNKNOWN_CYCLES;
  unsigned TotalCycles = 0;

public:
  ReadState(const ReadDescriptor &Desc) : RD(Desc) {}
  ReadState(const ReadState &Other) = delete;
  ReadState &operator=(const ReadState &Other) = delete;

  // A read is ready when it has no outstanding dependent writes and its
  // cycle counter is either still unknown or has reached zero.
  bool isReady() const {
    const bool NoCyclesPending =
        CyclesLeft == UNKNOWN_CYCLES || CyclesLeft == 0;
    return !DependentWrites && NoCyclesPending;
  }

  const ReadDescriptor &getDescriptor() const { return RD; }
  unsigned getSchedClass() const { return RD.SchedClassID; }

  void cycleEvent();
  void writeStartEvent(unsigned Cycles);
  void setDependentWrites(unsigned Writes) { DependentWrites = Writes; }
};

/// \brief A sequence of cycles.
///
/// This class can be used as a building block to construct ranges of cycles.
class CycleSegment {
  unsigned Begin; // Inclusive.
  unsigned End;   // Exclusive.
  bool Reserved;  // Resources associated to this segment must be reserved.

public:
  CycleSegment(unsigned StartCycle, unsigned EndCycle, bool IsReserved = false)
      : Begin(StartCycle), End(EndCycle), Reserved(IsReserved) {}

  // True if Cycle falls inside the half-open range [Begin, End).
  bool contains(unsigned Cycle) const { return Cycle >= Begin && Cycle < End; }

  // True if this segment begins at or after the point where CS ends, i.e.
  // this segment lies entirely after CS.
  bool startsAfter(const CycleSegment &CS) const { return Begin >= CS.End; }

  // True if this segment ends at or before the point where CS begins, i.e.
  // this segment lies entirely before CS.
  bool endsBefore(const CycleSegment &CS) const { return End <= CS.Begin; }

  // Two segments overlap unless one lies entirely before/after the other.
  bool overlaps(const CycleSegment &CS) const {
    return !startsAfter(CS) && !endsBefore(CS);
  }

  // Segments are expressed relative to the current cycle: a segment whose
  // Begin is zero has started, and a segment whose End is zero is over.
  bool isExecuting() const { return Begin == 0 && End != 0; }
  bool isExecuted() const { return End == 0; }

  // Segments are ordered by their first cycle only.
  bool operator<(const CycleSegment &Other) const {
    return Begin < Other.Begin;
  }

  // Moves the segment one cycle closer to cycle zero. Both bounds saturate
  // at zero instead of wrapping around.
  CycleSegment &operator--() {
    if (Begin)
      Begin--;
    if (End)
      End--;
    return *this;
  }

  bool isValid() const { return Begin <= End; }
  unsigned size() const { return End - Begin; }

  // Shortens the segment by moving End back by Cycles.
  void Subtract(unsigned Cycles) {
    assert(End >= Cycles);
    End -= Cycles;
  }

  unsigned begin() const { return Begin; }
  unsigned end() const { return End; }
  void setEnd(unsigned NewEnd) { End = NewEnd; }
  bool isReserved() const { return Reserved; }
  void setReserved() { Reserved = true; }
};

/// \brief Helper used by class InstrDesc to describe how hardware resources
/// are used.
///
/// This class describes how many resource units of a specific resource kind
/// (and how many cycles) are "used" by an instruction.
struct ResourceUsage {
  // The cycles during which the resource is used.
  CycleSegment CS;
  // How many units of the resource are used.
  unsigned NumUnits;

  ResourceUsage(CycleSegment Cycles, unsigned Units = 1)
      : CS(Cycles), NumUnits(Units) {}

  // The queries below simply forward to the underlying cycle segment.
  unsigned size() const {
    return CS.size();
  }
  bool isReserved() const {
    return CS.isReserved();
  }
  void setReserved() {
    CS.setReserved();
  }
};

/// \brief An instruction descriptor
struct InstrDesc {
  // Register writes performed by the instruction.
  std::vector<WriteDescriptor> Writes; // Implicit writes are at the end.
  // Register reads performed by the instruction.
  std::vector<ReadDescriptor> Reads; // Implicit reads are at the end.

  // For every resource used by an instruction of this kind, this vector
  // reports the number of "consumed cycles".
  // NOTE(review): the uint64_t key presumably identifies the resource;
  // confirm its encoding against the code that fills this vector.
  std::vector<std::pair<uint64_t, ResourceUsage>> Resources;

  // A list of buffered resources consumed by this instruction.
  std::vector<uint64_t> Buffers;
  // Maximum latency of the instruction. A value of zero is one of the
  // conditions checked by Instruction::isZeroLatency().
  unsigned MaxLatency;
  // Number of MicroOps for this instruction.
  unsigned NumMicroOps;

  // Memory/side-effect properties of the instruction.
  // NOTE(review): presumably mirrored from the corresponding MCInstrDesc
  // flags; confirm against the code that builds InstrDesc objects.
  bool MayLoad;
  bool MayStore;
  bool HasSideEffects;
};

/// An instruction dispatched to the out-of-order backend.
///
/// This class is used to monitor changes in the internal state of instructions
/// that are dispatched by the DispatchUnit to the hardware schedulers.
class Instruction {
  const InstrDesc &Desc;

  enum InstrStage {
    IS_INVALID,   // Instruction in an invalid state.
    IS_AVAILABLE, // Instruction dispatched but operands are not ready.
    IS_READY,     // Instruction dispatched and operands ready.
    IS_EXECUTING, // Instruction issued.
    IS_EXECUTED,  // Instruction executed. Values are written back.
    IS_RETIRED    // Instruction retired.
  };

  // The current instruction stage.
  enum InstrStage Stage;

  // Number of cycles left before the instruction is considered executed.
  // The constructor sets it to -1; users are expected to load the actual
  // value through setCyclesLeft(). While the instruction is executing, the
  // counter is decremented on every cycle, and the instruction is considered
  // executed when it reaches zero.
  int CyclesLeft;

  // Retire Unit token ID for this instruction. It is zero until a token is
  // assigned through setRCUTokenID().
  unsigned RCUTokenID;

  using UniqueDef = std::unique_ptr<WriteState>;
  using UniqueUse = std::unique_ptr<ReadState>;
  using VecDefs = std::vector<UniqueDef>;
  using VecUses = std::vector<UniqueUse>;

  // Output dependencies.
  // One entry per each implicit and explicit register definition.
  VecDefs Defs;

  // Input dependencies.
  // One entry per each implicit and explicit register use.
  VecUses Uses;

  // This instruction has already been dispatched, and all operands are ready.
  void setReady() {
    assert(Stage == IS_AVAILABLE);
    Stage = IS_READY;
  }

public:
  // Every scalar member is given a definite value here. In particular,
  // RCUTokenID is zero-initialized so that getRCUTokenID() never reads an
  // indeterminate value when it is called before setRCUTokenID().
  Instruction(const InstrDesc &D)
      : Desc(D), Stage(IS_INVALID), CyclesLeft(-1), RCUTokenID(0) {}
  Instruction(const Instruction &Other) = delete;
  Instruction &operator=(const Instruction &Other) = delete;

  VecDefs &getDefs() { return Defs; }
  const VecDefs &getDefs() const { return Defs; }
  VecUses &getUses() { return Uses; }
  const VecUses &getUses() const { return Uses; }
  const InstrDesc &getDesc() const { return Desc; }

  unsigned getRCUTokenID() const { return RCUTokenID; }
  int getCyclesLeft() const { return CyclesLeft; }
  void setCyclesLeft(int Cycles) { CyclesLeft = Cycles; }
  void setRCUTokenID(unsigned TokenID) { RCUTokenID = TokenID; }

  // Transition to the dispatch stage.
  // No definition is updated because the instruction is not "executing".
  void dispatch() {
    assert(Stage == IS_INVALID);
    Stage = IS_AVAILABLE;
  }

  // Instruction issued. Transition to the IS_EXECUTING state, and update
  // all the definitions.
  void execute();

  // Jump straight to the IS_EXECUTED state. This is only legal for a
  // zero-latency instruction that has not been dispatched yet, or for a
  // ready instruction whose descriptor reports a maximum latency of zero.
  void forceExecuted() {
    assert((Stage == IS_INVALID && isZeroLatency()) ||
           (Stage == IS_READY && Desc.MaxLatency == 0));
    Stage = IS_EXECUTED;
  }

  // Checks if operands are available. If all operands are ready,
  // then this forces a transition from IS_AVAILABLE to IS_READY.
  bool isReady();

  bool isDispatched() const { return Stage == IS_AVAILABLE; }
  bool isExecuting() const { return Stage == IS_EXECUTING; }
  bool isExecuted() const { return Stage == IS_EXECUTED; }
  bool isZeroLatency() const;

  // Transition from IS_EXECUTED to IS_RETIRED.
  void retire() {
    assert(Stage == IS_EXECUTED);
    Stage = IS_RETIRED;
  }

  // Advances the state of this instruction by one cycle.
  void cycleEvent();
};

} // namespace mca

#endif
22 changes: 22 additions & 0 deletions llvm/tools/llvm-mca/LLVMBuild.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
;===- ./tools/llvm-mca/LLVMBuild.txt ---------------------------*- Conf -*--===;
;
; The LLVM Compiler Infrastructure
;
; This file is distributed under the University of Illinois Open Source
; License. See LICENSE.TXT for details.
;
;===------------------------------------------------------------------------===;
;
; This is an LLVMBuild description file for the components in this subdirectory.
;
; For more information on the LLVMBuild system, please see:
;
; http://llvm.org/docs/LLVMBuild.html
;
;===------------------------------------------------------------------------===;

[component_0]
type = Tool
name = llvm-mca
parent = Tools
required_libraries = MC MCParser Support all-targets
115 changes: 115 additions & 0 deletions llvm/tools/llvm-mca/LSUnit.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
//===----------------------- LSUnit.cpp --------------------------*- C++-*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
/// \file
///
/// A Load-Store Unit for the llvm-mca tool.
///
//===----------------------------------------------------------------------===//

#include "LSUnit.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "llvm-mca"

namespace mca {

#ifndef NDEBUG
// Debug-only helper: prints the configured queue sizes and the current number
// of entries in the load and store queues on the debug stream.
void LSUnit::dump() const {
  auto &OS = dbgs();
  OS << "[LSUnit] LQ_Size = " << LQ_Size << '\n';
  OS << "[LSUnit] SQ_Size = " << SQ_Size << '\n';
  OS << "[LSUnit] NextLQSlotIdx = " << LoadQueue.size() << '\n';
  OS << "[LSUnit] NextSQSlotIdx = " << StoreQueue.size() << '\n';
}
#endif

// Records instruction Index in the load queue. The queue must have a free
// slot, and Index must not be in the queue already.
void LSUnit::assignLQSlot(unsigned Index) {
  assert(!isLQFull());
  assert(LoadQueue.find(Index) == LoadQueue.end());

  // The slot number reported here is the queue size before the insertion.
  DEBUG(dbgs() << "[LSUnit] - AssignLQSlot <Idx=" << Index
               << ",slot=" << LoadQueue.size() << ">\n");

  LoadQueue.insert(Index);
}

// Records instruction Index in the store queue. The queue must have a free
// slot, and Index must not be in the queue already.
void LSUnit::assignSQSlot(unsigned Index) {
  assert(!isSQFull());
  assert(StoreQueue.find(Index) == StoreQueue.end());

  // The slot number reported here is the queue size before the insertion.
  DEBUG(dbgs() << "[LSUnit] - AssignSQSlot <Idx=" << Index
               << ",slot=" << StoreQueue.size() << ">\n");

  StoreQueue.insert(Index);
}

bool LSUnit::isReady(unsigned Index) const {
bool IsALoad = LoadQueue.count(Index) != 0;
bool IsAStore = StoreQueue.count(Index) != 0;
unsigned LoadBarrierIndex = LoadBarriers.empty() ? 0 : *LoadBarriers.begin();
unsigned StoreBarrierIndex = StoreBarriers.empty() ? 0 : *StoreBarriers.begin();

if (IsALoad && LoadBarrierIndex) {
if (Index > LoadBarrierIndex)
return false;
if (Index == LoadBarrierIndex && Index != *LoadQueue.begin())
return false;
}

if (IsAStore && StoreBarrierIndex) {
if (Index > StoreBarrierIndex)
return false;
if (Index == StoreBarrierIndex && Index != *StoreQueue.begin())
return false;
}

if (NoAlias && IsALoad)
return true;

if (StoreQueue.size()) {
// Check if this memory operation is younger than the older store.
if (Index > *StoreQueue.begin())
return false;
}

// Okay, we are older than the oldest store in the queue.
// If there are no pending loads, then we can say for sure that this
// instruction is ready.
if (isLQEmpty())
return true;

// Check if there are no older loads.
if (Index <= *LoadQueue.begin())
return true;

// There is at least one younger load.
return !IsAStore;
}

// Releases the load/store queue entries held by instruction Index. If Index
// is also the oldest store barrier and/or the oldest load barrier, that
// barrier is dropped as well.
void LSUnit::onInstructionExecuted(unsigned Index) {
  const auto LoadIt = LoadQueue.find(Index);
  if (LoadIt != LoadQueue.end()) {
    DEBUG(dbgs() << "[LSUnit]: Instruction idx=" << Index
                 << " has been removed from the load queue.\n");
    LoadQueue.erase(LoadIt);
  }

  const auto StoreIt = StoreQueue.find(Index);
  if (StoreIt != StoreQueue.end()) {
    DEBUG(dbgs() << "[LSUnit]: Instruction idx=" << Index
                 << " has been removed from the store queue.\n");
    StoreQueue.erase(StoreIt);
  }

  // Only the oldest barrier of each kind is a candidate for removal.
  const bool IsOldestStoreBarrier =
      !StoreBarriers.empty() && *StoreBarriers.begin() == Index;
  if (IsOldestStoreBarrier)
    StoreBarriers.erase(StoreBarriers.begin());

  const bool IsOldestLoadBarrier =
      !LoadBarriers.empty() && *LoadBarriers.begin() == Index;
  if (IsOldestLoadBarrier)
    LoadBarriers.erase(LoadBarriers.begin());
}
} // namespace mca
160 changes: 160 additions & 0 deletions llvm/tools/llvm-mca/LSUnit.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
//===------------------------- LSUnit.h --------------------------*- C++-*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
/// \file
///
/// A Load/Store unit class that models load/store queues and that implements
/// a simple weak memory consistency model.
///
//===----------------------------------------------------------------------===//

#ifndef LLVM_TOOLS_LLVM_MCA_LSUNIT_H
#define LLVM_TOOLS_LLVM_MCA_LSUNIT_H

#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include <set>

#define DEBUG_TYPE "llvm-mca"

namespace mca {

/// \brief A Load/Store Unit implementing a load and store queues.
///
/// This class implements a load queue and a store queue to emulate the
/// out-of-order execution of memory operations.
/// Each load (or store) consumes an entry in the load (or store) queue.
///
/// Rules are:
/// 1) A younger load is allowed to pass an older load only if there are no
/// stores nor barriers in between the two loads.
/// 2) A younger store is not allowed to pass an older store.
/// 3) A younger store is not allowed to pass an older load.
/// 4) A younger load is allowed to pass an older store only if the load does
/// not alias with the store.
///
/// This class optimistically assumes that loads don't alias store operations.
/// Under this assumption, younger loads are always allowed to pass older
/// stores (this only affects rule 4).
/// Essentially, this LSUnit doesn't attempt to run any sort of alias analysis
/// to predict when loads and stores don't alias with each other.
///
/// To enforce aliasing between loads and stores, flag `AssumeNoAlias` must be
/// set to `false` by the constructor of LSUnit.
///
/// In the case of write-combining memory, rule 2. could be relaxed to allow
/// reordering of non-aliasing store operations. At the moment, this is not
/// allowed.
/// To put it in another way, there is no option to specify a different memory
/// type for memory operations (example: write-through, write-combining, etc.).
/// Also, there is no way to weaken the memory model, and this unit currently
/// doesn't support write-combining behavior.
///
/// No assumptions are made on the size of the store buffer.
/// As mentioned before, this class doesn't perform alias analysis.
/// Consequently, LSUnit doesn't know how to identify cases where
/// store-to-load forwarding may occur.
///
/// LSUnit doesn't attempt to predict whether a load or store hits or misses
/// the L1 cache. To be more specific, LSUnit doesn't know anything about
/// the cache hierarchy and memory types.
/// It only knows if an instruction "mayLoad" and/or "mayStore". For loads, the
/// scheduling model provides an "optimistic" load-to-use latency (which usually
/// matches the load-to-use latency for when there is a hit in the L1D).
///
/// Class MCInstrDesc in LLVM doesn't know about serializing operations, nor
/// memory-barrier like instructions.
/// LSUnit conservatively assumes that an instruction which `mayLoad` and has
/// `unmodeled side effects` behave like a "soft" load-barrier. That means, it
/// serializes loads without forcing a flush of the load queue.
/// Similarly, instructions that both `mayStore` and have `unmodeled side
/// effects` are treated like store barriers. A full memory
/// barrier is a 'mayLoad' and 'mayStore' instruction with unmodeled side
/// effects. This is obviously inaccurate, but this is the best that we can do
/// at the moment.
///
/// Each load/store barrier consumes one entry in the load/store queue. A
/// load/store barrier enforces ordering of loads/stores:
/// - A younger load cannot pass a load barrier.
/// - A younger store cannot pass a store barrier.
///
/// A younger load has to wait for the memory load barrier to execute.
/// A load/store barrier is "executed" when it becomes the oldest entry in
/// the load/store queue(s). That also means, all the older loads/stores have
/// already been executed.
class LSUnit {
  // Load queue size.
  // LQ_Size == 0 means that there are infinite slots in the load queue.
  unsigned LQ_Size;

  // Store queue size.
  // SQ_Size == 0 means that there are infinite slots in the store queue.
  unsigned SQ_Size;

  // If true, loads are assumed to never alias with stores, and are therefore
  // allowed to pass older stores. Note that the constructor defaults this
  // flag to false: users must explicitly pass `AssumeNoAlias = true` to
  // enable the optimistic behavior.
  bool NoAlias;

  // Indices of the loads/stores that have been reserved and not executed
  // yet. The sets are ordered, so begin() is always the oldest entry.
  std::set<unsigned> LoadQueue;
  std::set<unsigned> StoreQueue;

  void assignLQSlot(unsigned Index);
  void assignSQSlot(unsigned Index);

  // An instruction that both 'mayStore' and 'HasUnmodeledSideEffects' is
  // conservatively treated as a store barrier. It forces older store to be
  // executed before newer stores are issued.
  std::set<unsigned> StoreBarriers;

  // An instruction that both 'MayLoad' and 'HasUnmodeledSideEffects' is
  // conservatively treated as a load barrier. It forces older loads to execute
  // before newer loads are issued.
  std::set<unsigned> LoadBarriers;

public:
  LSUnit(unsigned LQ = 0, unsigned SQ = 0, bool AssumeNoAlias = false)
      : LQ_Size(LQ), SQ_Size(SQ), NoAlias(AssumeNoAlias) {}

#ifndef NDEBUG
  void dump() const;
#endif

  bool isSQEmpty() const { return StoreQueue.empty(); }
  bool isLQEmpty() const { return LoadQueue.empty(); }
  // A queue with size zero is unbounded, and therefore it is never full.
  bool isSQFull() const { return SQ_Size != 0 && StoreQueue.size() == SQ_Size; }
  bool isLQFull() const { return LQ_Size != 0 && LoadQueue.size() == LQ_Size; }

  // Reserves queue entries for the memory operation identified by Index.
  // A load consumes a load queue entry, a store consumes a store queue entry,
  // and an instruction that does both consumes one entry in each queue.
  // If IsMemBarrier is set, the instruction is also recorded as a load and/or
  // store barrier. Instructions that neither load nor store are ignored.
  void reserve(unsigned Index, bool MayLoad, bool MayStore, bool IsMemBarrier) {
    if (!MayLoad && !MayStore)
      return;
    if (MayLoad) {
      if (IsMemBarrier)
        LoadBarriers.insert(Index);
      assignLQSlot(Index);
    }
    if (MayStore) {
      if (IsMemBarrier)
        StoreBarriers.insert(Index);
      assignSQSlot(Index);
    }
  }

  // The rules are:
  // 1. A store may not pass a previous store.
  // 2. A load may not pass a previous store unless flag 'NoAlias' is set.
  // 3. A load may pass a previous load.
  // 4. A store may not pass a previous load (regardless of flag 'NoAlias').
  // 5. A load has to wait until an older load barrier is fully executed.
  // 6. A store has to wait until an older store barrier is fully executed.
  bool isReady(unsigned Index) const;

  // Releases the queue entries (and the barrier, if any) associated with
  // the executed instruction Index.
  void onInstructionExecuted(unsigned Index);
};

} // namespace mca

#endif
Loading