Skip to content

Commit

Permalink
[llvm-mca][BtVer2] Teach how to identify dependency-breaking idioms.
Browse files Browse the repository at this point in the history
This patch teaches llvm-mca how to identify dependency breaking instructions on
btver2.

An example of dependency breaking instructions is the zero-idiom XOR (example:
`XOR %eax, %eax`), which always generates zero regardless of the actual value of
the input register operands.
Dependency breaking instructions don't have to wait on their input register
operands before executing. This is because the computation is not dependent on
the inputs.

Not all dependency breaking idioms are also zero-latency instructions. For
example, `CMPEQ %xmm1, %xmm1` is independent of
the value of XMM1, and it generates a vector of all-ones.
That instruction is not eliminated at register renaming stage, and its opcode is
issued to a pipeline for execution. So, the latency is not zero. 

This patch adds a new method named isDependencyBreaking() to the MCInstrAnalysis
interface. That method takes as input an instruction (i.e. MCInst) and a
MCSubtargetInfo.
The default implementation of isDependencyBreaking() conservatively returns
false for all instructions. Targets may override the default behavior for
specific CPUs, and return a value which better matches the subtarget behavior.

In the future, we should teach TableGen how to automatically generate the body of
isDependencyBreaking from scheduling predicate definitions. This would allow us
to expose the knowledge about dependency breaking instructions to the machine
schedulers (and, potentially, other codegen passes).

Differential Revision: https://reviews.llvm.org/D49310

llvm-svn: 338372
  • Loading branch information
Andrea Di Biagio authored and Andrea Di Biagio committed Jul 31, 2018
1 parent c5018d4 commit a1852b6
Show file tree
Hide file tree
Showing 11 changed files with 224 additions and 114 deletions.
13 changes: 13 additions & 0 deletions llvm/include/llvm/MC/MCInstrAnalysis.h
Expand Up @@ -87,6 +87,19 @@ class MCInstrAnalysis {
const MCInst &Inst,
APInt &Writes) const;

/// Returns true if \p Inst is a dependency-breaking instruction for the
/// subtarget described by \p STI.
///
/// The value computed by a dependency-breaking instruction does not depend
/// on its inputs. An example of a dependency-breaking instruction on X86 is
/// `XOR %eax, %eax`.
///
/// The default implementation conservatively returns false for every
/// instruction; targets may override it to describe specific CPUs.
///
/// TODO: In the future, we could implement an alternative approach where this
/// method returns `true` if the input instruction is not dependent on
/// some/all of its input operands. An APInt mask could then be used to
/// identify independent operands.
virtual bool isDependencyBreaking(const MCSubtargetInfo &STI,
const MCInst &Inst) const;

/// Given a branch instruction try to get the address the branch
/// targets. Return true on success, and the address in Target.
virtual bool
Expand Down
5 changes: 5 additions & 0 deletions llvm/lib/MC/MCInstrAnalysis.cpp
Expand Up @@ -24,6 +24,11 @@ bool MCInstrAnalysis::clearsSuperRegisters(const MCRegisterInfo &MRI,
return false;
}

// Conservative default: no instruction is reported as dependency breaking.
// This is a virtual hook; target-specific subclasses override it to recognize
// the idioms of particular CPUs.
bool MCInstrAnalysis::isDependencyBreaking(const MCSubtargetInfo &STI,
const MCInst &Inst) const {
return false;
}

bool MCInstrAnalysis::evaluateBranch(const MCInst &Inst, uint64_t Addr,
uint64_t Size, uint64_t &Target) const {
if (Inst.getNumOperands() == 0 ||
Expand Down
74 changes: 74 additions & 0 deletions llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
Expand Up @@ -307,10 +307,84 @@ class X86MCInstrAnalysis : public MCInstrAnalysis {
public:
X86MCInstrAnalysis(const MCInstrInfo *MCII) : MCInstrAnalysis(MCII) {}

/// Recognizes dependency-breaking register idioms (e.g. an instruction whose
/// two source register operands are the same register) for the "btver2" CPU;
/// returns false for every other CPU.
bool isDependencyBreaking(const MCSubtargetInfo &STI,
const MCInst &Inst) const override;
bool clearsSuperRegisters(const MCRegisterInfo &MRI, const MCInst &Inst,
APInt &Mask) const override;
};

bool X86MCInstrAnalysis::isDependencyBreaking(const MCSubtargetInfo &STI,
                                              const MCInst &Inst) const {
  // Dependency-breaking idioms are only modelled for btver2; every other CPU
  // gets the conservative answer.
  if (STI.getCPU() != "btver2")
    return false;

  // Reference: Agner Fog's microarchitecture.pdf - Section 20 "AMD Bobcat and
  // Jaguar pipeline", subsection 8 "Dependency-breaking instructions".
  //
  // Each recognized opcode breaks the dependency only when two adjacent
  // register operands name the same register. LhsIdx is the index of the
  // first operand of that pair; the second one is at LhsIdx + 1.
  unsigned LhsIdx;
  switch (Inst.getOpcode()) {
  default:
    // Not one of the recognized idioms.
    return false;

  // Compare instructions: the pair to check is operands 0 and 1.
  case X86::CMP32rr:
  case X86::CMP64rr:
    LhsIdx = 0;
    break;

  // All remaining idioms: the pair to check is operands 1 and 2.
  // Scalar integer subtract / subtract-with-borrow / xor.
  case X86::SUB32rr:
  case X86::SUB64rr:
  case X86::SBB32rr:
  case X86::SBB64rr:
  case X86::XOR32rr:
  case X86::XOR64rr:
  // Floating-point domain xor / and-not.
  case X86::XORPSrr:
  case X86::XORPDrr:
  case X86::VXORPSrr:
  case X86::VXORPDrr:
  case X86::ANDNPSrr:
  case X86::VANDNPSrr:
  case X86::ANDNPDrr:
  case X86::VANDNPDrr:
  // Vector integer xor / and-not.
  case X86::PXORrr:
  case X86::VPXORrr:
  case X86::PANDNrr:
  case X86::VPANDNrr:
  // Vector integer subtract.
  case X86::PSUBBrr:
  case X86::PSUBWrr:
  case X86::PSUBDrr:
  case X86::PSUBQrr:
  case X86::VPSUBBrr:
  case X86::VPSUBWrr:
  case X86::VPSUBDrr:
  case X86::VPSUBQrr:
  // Vector integer compare-equal.
  case X86::PCMPEQBrr:
  case X86::PCMPEQWrr:
  case X86::PCMPEQDrr:
  case X86::PCMPEQQrr:
  case X86::VPCMPEQBrr:
  case X86::VPCMPEQWrr:
  case X86::VPCMPEQDrr:
  case X86::VPCMPEQQrr:
  // Vector integer compare-greater-than.
  case X86::PCMPGTBrr:
  case X86::PCMPGTWrr:
  case X86::PCMPGTDrr:
  case X86::PCMPGTQrr:
  case X86::VPCMPGTBrr:
  case X86::VPCMPGTWrr:
  case X86::VPCMPGTDrr:
  case X86::VPCMPGTQrr:
  // MMX variants.
  case X86::MMX_PXORirr:
  case X86::MMX_PANDNirr:
  case X86::MMX_PSUBBirr:
  case X86::MMX_PSUBDirr:
  case X86::MMX_PSUBQirr:
  case X86::MMX_PSUBWirr:
  case X86::MMX_PCMPGTBirr:
  case X86::MMX_PCMPGTDirr:
  case X86::MMX_PCMPGTWirr:
  case X86::MMX_PCMPEQBirr:
  case X86::MMX_PCMPEQDirr:
  case X86::MMX_PCMPEQWirr:
    LhsIdx = 1;
    break;
  }

  return Inst.getOperand(LhsIdx).getReg() ==
         Inst.getOperand(LhsIdx + 1).getReg();
}

bool X86MCInstrAnalysis::clearsSuperRegisters(const MCRegisterInfo &MRI,
const MCInst &Inst,
APInt &Mask) const {
Expand Down
22 changes: 11 additions & 11 deletions llvm/test/tools/llvm-mca/X86/BtVer2/dependency-breaking-cmp.s
Expand Up @@ -11,9 +11,9 @@ cmovae %ebx, %eax

# CHECK: Iterations: 1500
# CHECK-NEXT: Instructions: 3000
# CHECK-NEXT: Total Cycles: 3003
# CHECK-NEXT: Total Cycles: 1504
# CHECK-NEXT: Dispatch Width: 2
# CHECK-NEXT: IPC: 1.00
# CHECK-NEXT: IPC: 1.99
# CHECK-NEXT: Block RThroughput: 1.0

# CHECK: Instruction Info:
Expand Down Expand Up @@ -54,14 +54,14 @@ cmovae %ebx, %eax
# CHECK-NEXT: 1.00 - - - - - - - - - - - - - cmovael %ebx, %eax

# CHECK: Timeline view:
# CHECK-NEXT: Index 012345678
# CHECK-NEXT: Index 0123456

# CHECK: [0,0] DeER . . cmpl %eax, %eax
# CHECK-NEXT: [0,1] D=eER. . cmovael %ebx, %eax
# CHECK-NEXT: [1,0] .D=eER . cmpl %eax, %eax
# CHECK-NEXT: [1,1] .D==eER . cmovael %ebx, %eax
# CHECK-NEXT: [2,0] . D==eER. cmpl %eax, %eax
# CHECK-NEXT: [2,1] . D===eER cmovael %ebx, %eax
# CHECK: [0,0] DeER .. cmpl %eax, %eax
# CHECK-NEXT: [0,1] D=eER.. cmovael %ebx, %eax
# CHECK-NEXT: [1,0] .DeER.. cmpl %eax, %eax
# CHECK-NEXT: [1,1] .D=eER. cmovael %ebx, %eax
# CHECK-NEXT: [2,0] . DeER. cmpl %eax, %eax
# CHECK-NEXT: [2,1] . D=eER cmovael %ebx, %eax

# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
Expand All @@ -70,5 +70,5 @@ cmovae %ebx, %eax
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 3 2.0 0.3 0.0 cmpl %eax, %eax
# CHECK-NEXT: 1. 3 3.0 0.0 0.0 cmovael %ebx, %eax
# CHECK-NEXT: 0. 3 1.0 1.0 0.0 cmpl %eax, %eax
# CHECK-NEXT: 1. 3 2.0 0.0 0.0 cmovael %ebx, %eax
39 changes: 19 additions & 20 deletions llvm/test/tools/llvm-mca/X86/BtVer2/dependency-breaking-pcmpeq.s
Expand Up @@ -14,9 +14,9 @@ vpcmpeqq %xmm3, %xmm3, %xmm0

# CHECK: Iterations: 1500
# CHECK-NEXT: Instructions: 6000
# CHECK-NEXT: Total Cycles: 6003
# CHECK-NEXT: Total Cycles: 3003
# CHECK-NEXT: Dispatch Width: 2
# CHECK-NEXT: IPC: 1.00
# CHECK-NEXT: IPC: 2.00
# CHECK-NEXT: Block RThroughput: 2.0

# CHECK: Instruction Info:
Expand Down Expand Up @@ -61,21 +61,20 @@ vpcmpeqq %xmm3, %xmm3, %xmm0
# CHECK-NEXT: - - - - - 1.00 - - - - - 1.00 - - vpcmpeqq %xmm3, %xmm3, %xmm0

# CHECK: Timeline view:
# CHECK-NEXT: 01234
# CHECK-NEXT: Index 0123456789
# CHECK-NEXT: Index 012345678

# CHECK: [0,0] DeER . . . vpcmpeqb %xmm0, %xmm0, %xmm1
# CHECK-NEXT: [0,1] D=eER. . . vpcmpeqw %xmm1, %xmm1, %xmm2
# CHECK-NEXT: [0,2] .D=eER . . vpcmpeqd %xmm2, %xmm2, %xmm3
# CHECK-NEXT: [0,3] .D==eER . . vpcmpeqq %xmm3, %xmm3, %xmm0
# CHECK-NEXT: [1,0] . D==eER . . vpcmpeqb %xmm0, %xmm0, %xmm1
# CHECK-NEXT: [1,1] . D===eER . . vpcmpeqw %xmm1, %xmm1, %xmm2
# CHECK-NEXT: [1,2] . D===eER. . vpcmpeqd %xmm2, %xmm2, %xmm3
# CHECK-NEXT: [1,3] . D====eER . vpcmpeqq %xmm3, %xmm3, %xmm0
# CHECK-NEXT: [2,0] . D====eER . vpcmpeqb %xmm0, %xmm0, %xmm1
# CHECK-NEXT: [2,1] . D=====eER . vpcmpeqw %xmm1, %xmm1, %xmm2
# CHECK-NEXT: [2,2] . D=====eER. vpcmpeqd %xmm2, %xmm2, %xmm3
# CHECK-NEXT: [2,3] . D======eER vpcmpeqq %xmm3, %xmm3, %xmm0
# CHECK: [0,0] DeER . . vpcmpeqb %xmm0, %xmm0, %xmm1
# CHECK-NEXT: [0,1] DeER . . vpcmpeqw %xmm1, %xmm1, %xmm2
# CHECK-NEXT: [0,2] .DeER. . vpcmpeqd %xmm2, %xmm2, %xmm3
# CHECK-NEXT: [0,3] .DeER. . vpcmpeqq %xmm3, %xmm3, %xmm0
# CHECK-NEXT: [1,0] . DeER . vpcmpeqb %xmm0, %xmm0, %xmm1
# CHECK-NEXT: [1,1] . DeER . vpcmpeqw %xmm1, %xmm1, %xmm2
# CHECK-NEXT: [1,2] . DeER . vpcmpeqd %xmm2, %xmm2, %xmm3
# CHECK-NEXT: [1,3] . DeER . vpcmpeqq %xmm3, %xmm3, %xmm0
# CHECK-NEXT: [2,0] . DeER. vpcmpeqb %xmm0, %xmm0, %xmm1
# CHECK-NEXT: [2,1] . DeER. vpcmpeqw %xmm1, %xmm1, %xmm2
# CHECK-NEXT: [2,2] . DeER vpcmpeqd %xmm2, %xmm2, %xmm3
# CHECK-NEXT: [2,3] . DeER vpcmpeqq %xmm3, %xmm3, %xmm0

# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
Expand All @@ -84,7 +83,7 @@ vpcmpeqq %xmm3, %xmm3, %xmm0
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 3 3.0 0.3 0.0 vpcmpeqb %xmm0, %xmm0, %xmm1
# CHECK-NEXT: 1. 3 4.0 0.0 0.0 vpcmpeqw %xmm1, %xmm1, %xmm2
# CHECK-NEXT: 2. 3 4.0 0.0 0.0 vpcmpeqd %xmm2, %xmm2, %xmm3
# CHECK-NEXT: 3. 3 5.0 0.0 0.0 vpcmpeqq %xmm3, %xmm3, %xmm0
# CHECK-NEXT: 0. 3 1.0 1.0 0.0 vpcmpeqb %xmm0, %xmm0, %xmm1
# CHECK-NEXT: 1. 3 1.0 1.0 0.0 vpcmpeqw %xmm1, %xmm1, %xmm2
# CHECK-NEXT: 2. 3 1.0 1.0 0.0 vpcmpeqd %xmm2, %xmm2, %xmm3
# CHECK-NEXT: 3. 3 1.0 1.0 0.0 vpcmpeqq %xmm3, %xmm3, %xmm0
36 changes: 18 additions & 18 deletions llvm/test/tools/llvm-mca/X86/BtVer2/dependency-breaking-sbb-2.s
Expand Up @@ -13,9 +13,9 @@ sbb %eax, %eax

# CHECK: Iterations: 1500
# CHECK-NEXT: Instructions: 4500
# CHECK-NEXT: Total Cycles: 6745
# CHECK-NEXT: Total Cycles: 3007
# CHECK-NEXT: Dispatch Width: 2
# CHECK-NEXT: IPC: 0.67
# CHECK-NEXT: IPC: 1.50
# CHECK-NEXT: Block RThroughput: 2.0

# CHECK: Instruction Info:
Expand Down Expand Up @@ -49,27 +49,27 @@ sbb %eax, %eax

# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
# CHECK-NEXT: 2.01 1.99 - - - - - - 1.00 - - - - -
# CHECK-NEXT: 2.00 2.00 - - - - - - 1.00 - - - - -

# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
# CHECK-NEXT: - 1.00 - - - - - - 1.00 - - - - - imull %edx, %eax
# CHECK-NEXT: 0.99 0.01 - - - - - - - - - - - - addl %edx, %edx
# CHECK-NEXT: 1.01 0.99 - - - - - - - - - - - - sbbl %eax, %eax
# CHECK-NEXT: - 1.00 - - - - - - - - - - - - addl %edx, %edx
# CHECK-NEXT: 2.00 - - - - - - - - - - - - - sbbl %eax, %eax

# CHECK: Timeline view:
# CHECK-NEXT: 012345
# CHECK-NEXT: 01
# CHECK-NEXT: Index 0123456789

# CHECK: [0,0] DeeeER . . imull %edx, %eax
# CHECK-NEXT: [0,1] .DeE-R . . addl %edx, %edx
# CHECK-NEXT: [0,2] .D==eER . . sbbl %eax, %eax
# CHECK-NEXT: [1,0] . D===eeeER . imull %edx, %eax
# CHECK-NEXT: [1,1] . DeE----R . addl %edx, %edx
# CHECK-NEXT: [1,2] . D=====eER . sbbl %eax, %eax
# CHECK-NEXT: [2,0] . D=====eeeER. imull %edx, %eax
# CHECK-NEXT: [2,1] . DeE------R. addl %edx, %edx
# CHECK-NEXT: [2,2] . D=======eER sbbl %eax, %eax
# CHECK: [0,0] DeeeER .. imull %edx, %eax
# CHECK-NEXT: [0,1] .DeE-R .. addl %edx, %edx
# CHECK-NEXT: [0,2] .D=eE-R .. sbbl %eax, %eax
# CHECK-NEXT: [1,0] . D==eeeER.. imull %edx, %eax
# CHECK-NEXT: [1,1] . DeE---R.. addl %edx, %edx
# CHECK-NEXT: [1,2] . D=eE---R. sbbl %eax, %eax
# CHECK-NEXT: [2,0] . D=eeeER. imull %edx, %eax
# CHECK-NEXT: [2,1] . D=eE--R addl %edx, %edx
# CHECK-NEXT: [2,2] . D==eE-R sbbl %eax, %eax

# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
Expand All @@ -78,6 +78,6 @@ sbb %eax, %eax
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 3 3.7 0.7 0.0 imull %edx, %eax
# CHECK-NEXT: 1. 3 1.0 1.0 3.7 addl %edx, %edx
# CHECK-NEXT: 2. 3 5.7 0.0 0.0 sbbl %eax, %eax
# CHECK-NEXT: 0. 3 2.0 0.7 0.0 imull %edx, %eax
# CHECK-NEXT: 1. 3 1.3 1.3 2.0 addl %edx, %edx
# CHECK-NEXT: 2. 3 2.3 0.0 1.7 sbbl %eax, %eax

0 comments on commit a1852b6

Please sign in to comment.