Skip to content

Commit

Permalink
[x86] avoid false dependency stall on 'sbb' with same source reg
Browse files Browse the repository at this point in the history
This is effectively inverting the transform added with D116804
because the downside of the false dependency of something like
"sbb %eax, %eax" is much greater than the upside of eliminating
a zeroing instruction on (all?) Intel CPUs.

Differential Revision: https://reviews.llvm.org/D118843
  • Loading branch information
rotateright committed Feb 7, 2022
1 parent 3c33b20 commit 40a50f8
Show file tree
Hide file tree
Showing 20 changed files with 521 additions and 387 deletions.
13 changes: 11 additions & 2 deletions llvm/lib/Target/X86/X86.td
Original file line number Diff line number Diff line change
Expand Up @@ -445,6 +445,10 @@ def TuningLZCNTFalseDeps : SubtargetFeature<"false-deps-lzcnt-tzcnt",
"HasLZCNTFalseDeps", "true",
"LZCNT/TZCNT have a false dependency on dest register">;

// Tuning flag for CPUs on which an SBB whose two operands are the same
// register does not wait on that register's previous value. When it is set,
// instruction selection emits the bare "sbb reg, reg" idiom; when it is not,
// the register is explicitly zeroed first to avoid a false dependency stall.
def TuningSBBDepBreaking : SubtargetFeature<"sbb-dep-breaking",
"HasSBBDepBreaking", "true",
"SBB with same register has no source dependency">;

// On recent X86 (port bound) processors, it's preferable to combine to a single shuffle
// using a variable mask over multiple fixed shuffles.
def TuningFastVariableCrossLaneShuffle
Expand Down Expand Up @@ -1032,6 +1036,7 @@ def ProcessorFeatures {
Feature64Bit];
list<SubtargetFeature> BarcelonaTuning = [TuningFastScalarShiftMasks,
TuningSlowSHLD,
TuningSBBDepBreaking,
TuningInsertVZEROUPPER];

// Bobcat
Expand All @@ -1053,6 +1058,7 @@ def ProcessorFeatures {
TuningFastScalarShiftMasks,
TuningFastVectorShiftMasks,
TuningSlowSHLD,
TuningSBBDepBreaking,
TuningInsertVZEROUPPER];

// Jaguar
Expand All @@ -1072,6 +1078,7 @@ def ProcessorFeatures {
TuningFastScalarShiftMasks,
TuningFastVectorShiftMasks,
TuningFastMOVBE,
TuningSBBDepBreaking,
TuningSlowSHLD];
list<SubtargetFeature> BtVer2Features =
!listconcat(BtVer1Features, BtVer2AdditionalFeatures);
Expand Down Expand Up @@ -1099,6 +1106,7 @@ def ProcessorFeatures {
TuningFast11ByteNOP,
TuningFastScalarShiftMasks,
TuningBranchFusion,
TuningSBBDepBreaking,
TuningInsertVZEROUPPER];

// PileDriver
Expand Down Expand Up @@ -1174,6 +1182,7 @@ def ProcessorFeatures {
TuningFastScalarShiftMasks,
TuningFastMOVBE,
TuningSlowSHLD,
TuningSBBDepBreaking,
TuningInsertVZEROUPPER];
list<SubtargetFeature> ZN2AdditionalFeatures = [FeatureCLWB,
FeatureRDPID,
Expand Down Expand Up @@ -1445,15 +1454,15 @@ foreach P = ["k8", "opteron", "athlon64", "athlon-fx"] in {
def : Proc<P, [FeatureX87, FeatureCMPXCHG8B, FeatureSSE2, Feature3DNowA,
FeatureFXSR, FeatureNOPL, Feature64Bit, FeatureCMOV],
[TuningFastScalarShiftMasks, TuningSlowSHLD, TuningSlowUAMem16,
TuningInsertVZEROUPPER]>;
TuningSBBDepBreaking, TuningInsertVZEROUPPER]>;
}

foreach P = ["k8-sse3", "opteron-sse3", "athlon64-sse3"] in {
def : Proc<P, [FeatureX87, FeatureCMPXCHG8B, FeatureSSE3, Feature3DNowA,
FeatureFXSR, FeatureNOPL, FeatureCMPXCHG16B, FeatureCMOV,
Feature64Bit],
[TuningFastScalarShiftMasks, TuningSlowSHLD, TuningSlowUAMem16,
TuningInsertVZEROUPPER]>;
TuningSBBDepBreaking, TuningInsertVZEROUPPER]>;
}

foreach P = ["amdfam10", "barcelona"] in {
Expand Down
42 changes: 27 additions & 15 deletions llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -464,8 +464,13 @@ namespace {
}

// Copy flags to the EFLAGS register and glue it to next node.
SDValue EFLAGS = CurDAG->getCopyToReg(
CurDAG->getEntryNode(), dl, X86::EFLAGS, N->getOperand(2), SDValue());
// The incoming flags live in a different operand slot depending on the node:
// operand 2 for X86ISD::SBB, operand 1 for X86ISD::SETCC_CARRY.
unsigned Opcode = N->getOpcode();
// The opcode disjunction is parenthesized so that the message string is
// attached to the whole condition instead of only the second comparison
// (this also silences -Wparentheses).
assert((Opcode == X86ISD::SBB || Opcode == X86ISD::SETCC_CARRY) &&
       "Unexpected opcode for SBB materialization");
unsigned FlagOpIndex = Opcode == X86ISD::SBB ? 2 : 1;
SDValue EFLAGS =
    CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS,
                         N->getOperand(FlagOpIndex), SDValue());

// Create a 64-bit instruction if the result is 64-bits otherwise use the
// 32-bit version.
Expand Down Expand Up @@ -5801,21 +5806,28 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
break;

case X86ISD::SETCC_CARRY: {
// We have to do this manually because tblgen will put the eflags copy in
// the wrong place if we use an extract_subreg in the pattern.
MVT VT = Node->getSimpleValueType(0);
SDValue Result;
if (Subtarget->hasSBBDepBreaking()) {
// We have to do this manually because tblgen will put the eflags copy in
// the wrong place if we use an extract_subreg in the pattern.
// Copy flags to the EFLAGS register and glue it to next node.
SDValue EFLAGS =
CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS,
Node->getOperand(1), SDValue());

// Copy flags to the EFLAGS register and glue it to next node.
SDValue EFLAGS =
CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS,
Node->getOperand(1), SDValue());

// Create a 64-bit instruction if the result is 64-bits otherwise use the
// 32-bit version.
unsigned Opc = VT == MVT::i64 ? X86::SETB_C64r : X86::SETB_C32r;
MVT SetVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
SDValue Result = SDValue(
CurDAG->getMachineNode(Opc, dl, SetVT, EFLAGS, EFLAGS.getValue(1)), 0);
// Create a 64-bit instruction if the result is 64-bits otherwise use the
// 32-bit version.
unsigned Opc = VT == MVT::i64 ? X86::SETB_C64r : X86::SETB_C32r;
MVT SetVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
Result = SDValue(
CurDAG->getMachineNode(Opc, dl, SetVT, EFLAGS, EFLAGS.getValue(1)),
0);
} else {
// The target does not recognize sbb with the same reg operand as a
// no-source idiom, so we explicitly zero the input values.
Result = getSBBZero(Node);
}

// For less than 32-bits we need to extract from the 32-bit node.
if (VT == MVT::i8 || VT == MVT::i16) {
Expand Down
5 changes: 5 additions & 0 deletions llvm/lib/Target/X86/X86Subtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,10 @@ class X86Subtarget final : public X86GenSubtargetInfo {
/// True if LZCNT/TZCNT instructions have a false dependency on the destination register.
bool HasLZCNTFalseDeps = false;

/// True if an SBB instruction with same source register is recognized as
/// having no dependency on that register.
bool HasSBBDepBreaking = false;

/// True if it's preferable to combine to a single cross-lane shuffle
/// using a variable mask over multiple fixed shuffles.
bool HasFastVariableCrossLaneShuffle = false;
Expand Down Expand Up @@ -719,6 +723,7 @@ class X86Subtarget final : public X86GenSubtargetInfo {
bool useLeaForSP() const { return UseLeaForSP; }
bool hasPOPCNTFalseDeps() const { return HasPOPCNTFalseDeps; }
bool hasLZCNTFalseDeps() const { return HasLZCNTFalseDeps; }
bool hasSBBDepBreaking() const { return HasSBBDepBreaking; }
bool hasFastVariableCrossLaneShuffle() const {
return HasFastVariableCrossLaneShuffle;
}
Expand Down
12 changes: 7 additions & 5 deletions llvm/test/CodeGen/X86/combine-movmsk-avx.ll
Original file line number Diff line number Diff line change
Expand Up @@ -139,8 +139,9 @@ define i32 @movmskps_concat_v4f32(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: movmskps_concat_v4f32:
; CHECK: # %bb.0:
; CHECK-NEXT: vorps %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vmovmskps %xmm0, %eax
; CHECK-NEXT: negl %eax
; CHECK-NEXT: vmovmskps %xmm0, %ecx
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: negl %ecx
; CHECK-NEXT: sbbl %eax, %eax
; CHECK-NEXT: retq
%1 = shufflevector <4 x float> %a0, <4 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
Expand All @@ -153,9 +154,10 @@ define i32 @movmskps_concat_v4f32(<4 x float> %a0, <4 x float> %a1) {
define i32 @movmskps_demanded_concat_v4f32(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: movmskps_demanded_concat_v4f32:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovmskps %xmm0, %eax
; CHECK-NEXT: andl $3, %eax
; CHECK-NEXT: negl %eax
; CHECK-NEXT: vmovmskps %xmm0, %ecx
; CHECK-NEXT: andl $3, %ecx
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: negl %ecx
; CHECK-NEXT: sbbl %eax, %eax
; CHECK-NEXT: retq
%1 = shufflevector <4 x float> %a0, <4 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
Expand Down
10 changes: 7 additions & 3 deletions llvm/test/CodeGen/X86/copy-eflags.ll
Original file line number Diff line number Diff line change
Expand Up @@ -293,6 +293,7 @@ bb1:
define dso_local void @PR37431(i32* %arg1, i8* %arg2, i8* %arg3, i32 %arg4, i64 %arg5) nounwind {
; X32-LABEL: PR37431:
; X32: # %bb.0: # %entry
; X32-NEXT: pushl %ebp
; X32-NEXT: pushl %ebx
; X32-NEXT: pushl %edi
; X32-NEXT: pushl %esi
Expand All @@ -302,10 +303,11 @@ define dso_local void @PR37431(i32* %arg1, i8* %arg2, i8* %arg3, i32 %arg4, i64
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
; X32-NEXT: movl (%edi), %edi
; X32-NEXT: movl %edi, %ebx
; X32-NEXT: sarl $31, %ebx
; X32-NEXT: movl %edi, %ebp
; X32-NEXT: sarl $31, %ebp
; X32-NEXT: xorl %ebx, %ebx
; X32-NEXT: cmpl %edi, {{[0-9]+}}(%esp)
; X32-NEXT: sbbl %ebx, %esi
; X32-NEXT: sbbl %ebp, %esi
; X32-NEXT: sbbl %ebx, %ebx
; X32-NEXT: movb %bl, (%edx)
; X32-NEXT: cltd
Expand All @@ -314,13 +316,15 @@ define dso_local void @PR37431(i32* %arg1, i8* %arg2, i8* %arg3, i32 %arg4, i64
; X32-NEXT: popl %esi
; X32-NEXT: popl %edi
; X32-NEXT: popl %ebx
; X32-NEXT: popl %ebp
; X32-NEXT: retl
;
; X64-LABEL: PR37431:
; X64: # %bb.0: # %entry
; X64-NEXT: movl %ecx, %eax
; X64-NEXT: movq %rdx, %rcx
; X64-NEXT: movslq (%rdi), %rdx
; X64-NEXT: xorl %edi, %edi
; X64-NEXT: cmpq %rdx, %r8
; X64-NEXT: sbbl %edi, %edi
; X64-NEXT: movb %dil, (%rsi)
Expand Down
1 change: 1 addition & 0 deletions llvm/test/CodeGen/X86/jump_sign.ll
Original file line number Diff line number Diff line change
Expand Up @@ -310,6 +310,7 @@ define i32 @func_q(i32 %a0, i32 %a1, i32 %a2) {
; CHECK-LABEL: func_q:
; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: xorl %ecx, %ecx
; CHECK-NEXT: subl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: sbbl %ecx, %ecx
; CHECK-NEXT: negl %eax
Expand Down
2 changes: 2 additions & 0 deletions llvm/test/CodeGen/X86/machine-cse.ll
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,8 @@ define i32 @cross_mbb_phys_cse(i32 %a, i32 %b) nounwind ssp {
; CHECK-NEXT: cmpl %esi, %edi
; CHECK-NEXT: ja .LBB2_2
; CHECK-NEXT: # %bb.1: # %if.end
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: cmpl %esi, %edi
; CHECK-NEXT: sbbl %eax, %eax
; CHECK-NEXT: .LBB2_2: # %return
; CHECK-NEXT: retq
Expand Down
1 change: 1 addition & 0 deletions llvm/test/CodeGen/X86/pr32588.ll
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
define void @fn1() {
; CHECK-LABEL: fn1:
; CHECK: # %bb.0:
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: cmpl $1, c(%rip)
; CHECK-NEXT: sbbl %eax, %eax
; CHECK-NEXT: andl $1, %eax
Expand Down
1 change: 1 addition & 0 deletions llvm/test/CodeGen/X86/pr35972.ll
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ define void @test3(i32 %c, <64 x i1>* %ptr) {
; CHECK-LABEL: test3:
; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: xorl %ecx, %ecx
; CHECK-NEXT: cmpl $1, {{[0-9]+}}(%esp)
; CHECK-NEXT: sbbl %ecx, %ecx
; CHECK-NEXT: kmovd %ecx, %k0
Expand Down
60 changes: 55 additions & 5 deletions llvm/test/CodeGen/X86/sbb-false-dep.ll
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s --check-prefixes=CHECK
; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s --check-prefixes=CHECK
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sbb-dep-breaking | FileCheck %s --check-prefixes=IDIOM

%struct.y_s = type { i64*, i64* }

Expand All @@ -24,22 +25,24 @@ define i32 @mallocbench_gs(i32* noundef %0, %struct.y_s* noundef %1, i32 noundef
; CHECK-NEXT: callq foo1@PLT
; CHECK-NEXT: movq 8(%rbx), %rax
; CHECK-NEXT: movq (%rax), %rdx
; CHECK-NEXT: xorl %ebp, %ebp
; CHECK-NEXT: movl %r13d, %ecx
; CHECK-NEXT: negl %ecx
; CHECK-NEXT: sbbq %rbp, %rbp
; CHECK-NEXT: orq %rdx, %rbp
; CHECK-NEXT: cmpl $1, %r13d
; CHECK-NEXT: movl $0, %eax
; CHECK-NEXT: sbbq %rax, %rax
; CHECK-NEXT: orq %rdx, %rax
; CHECK-NEXT: cmpl $1, %r13d
; CHECK-NEXT: sbbq %rbp, %rbp
; CHECK-NEXT: orq %rdx, %rbp
; CHECK-NEXT: subq $8, %rsp
; CHECK-NEXT: movq %r12, %rdi
; CHECK-NEXT: movl %r15d, %esi
; CHECK-NEXT: movl %r14d, %edx
; CHECK-NEXT: xorl %ecx, %ecx
; CHECK-NEXT: xorl %r8d, %r8d
; CHECK-NEXT: xorl %r9d, %r9d
; CHECK-NEXT: pushq %rax
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: pushq %rax
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: callq foo2@PLT
; CHECK-NEXT: addq $40, %rsp
Expand All @@ -50,6 +53,53 @@ define i32 @mallocbench_gs(i32* noundef %0, %struct.y_s* noundef %1, i32 noundef
; CHECK-NEXT: popq %r15
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: retq
;
; IDIOM-LABEL: mallocbench_gs:
; IDIOM: # %bb.0:
; IDIOM-NEXT: pushq %rbp
; IDIOM-NEXT: pushq %r15
; IDIOM-NEXT: pushq %r14
; IDIOM-NEXT: pushq %r13
; IDIOM-NEXT: pushq %r12
; IDIOM-NEXT: pushq %rbx
; IDIOM-NEXT: pushq %rax
; IDIOM-NEXT: movl %r8d, %r13d
; IDIOM-NEXT: movl %ecx, %r14d
; IDIOM-NEXT: movl %edx, %r15d
; IDIOM-NEXT: movq %rsi, %rbx
; IDIOM-NEXT: movq %rdi, %r12
; IDIOM-NEXT: movq (%rsi), %rdi
; IDIOM-NEXT: movq 8(%rsi), %rsi
; IDIOM-NEXT: movq %rbx, %rdx
; IDIOM-NEXT: callq foo1@PLT
; IDIOM-NEXT: movq 8(%rbx), %rax
; IDIOM-NEXT: movq (%rax), %rdx
; IDIOM-NEXT: movl %r13d, %ecx
; IDIOM-NEXT: negl %ecx
; IDIOM-NEXT: sbbq %rbp, %rbp
; IDIOM-NEXT: orq %rdx, %rbp
; IDIOM-NEXT: cmpl $1, %r13d
; IDIOM-NEXT: sbbq %rax, %rax
; IDIOM-NEXT: orq %rdx, %rax
; IDIOM-NEXT: subq $8, %rsp
; IDIOM-NEXT: movq %r12, %rdi
; IDIOM-NEXT: movl %r15d, %esi
; IDIOM-NEXT: movl %r14d, %edx
; IDIOM-NEXT: xorl %ecx, %ecx
; IDIOM-NEXT: xorl %r8d, %r8d
; IDIOM-NEXT: xorl %r9d, %r9d
; IDIOM-NEXT: pushq %rax
; IDIOM-NEXT: pushq %rbp
; IDIOM-NEXT: pushq %rbx
; IDIOM-NEXT: callq foo2@PLT
; IDIOM-NEXT: addq $40, %rsp
; IDIOM-NEXT: popq %rbx
; IDIOM-NEXT: popq %r12
; IDIOM-NEXT: popq %r13
; IDIOM-NEXT: popq %r14
; IDIOM-NEXT: popq %r15
; IDIOM-NEXT: popq %rbp
; IDIOM-NEXT: retq
%6 = getelementptr inbounds %struct.y_s, %struct.y_s* %1, i64 0, i32 0
%7 = load i64*, i64** %6, align 8
%8 = getelementptr inbounds %struct.y_s, %struct.y_s* %1, i64 0, i32 1
Expand Down
39 changes: 27 additions & 12 deletions llvm/test/CodeGen/X86/sbb-zero-idiom.ll
Original file line number Diff line number Diff line change
@@ -1,18 +1,33 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s --check-prefixes=CHECK
; RUN: llc < %s -mtriple=x86_64-- -mcpu=sandybridge | FileCheck %s --check-prefixes=CHECK
; RUN: llc < %s -mtriple=x86_64-- -mcpu=skylake | FileCheck %s --check-prefixes=CHECK
; RUN: llc < %s -mtriple=x86_64-- -mcpu=k8 | FileCheck %s --check-prefixes=CHECK
; RUN: llc < %s -mtriple=x86_64-- -mcpu=btver1 | FileCheck %s --check-prefixes=CHECK
; RUN: llc < %s -mtriple=x86_64-- -mcpu=bdver2 | FileCheck %s --check-prefixes=CHECK
; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver3 | FileCheck %s --check-prefixes=CHECK

; Check the attribute.

; RUN: llc < %s -mtriple=x86_64-- -mattr=-sbb-dep-breaking | FileCheck %s --check-prefixes=ZERO
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sbb-dep-breaking | FileCheck %s --check-prefixes=IDIOM

; And check that CPUs have included the attribute as expected.

; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s --check-prefixes=ZERO
; RUN: llc < %s -mtriple=x86_64-- -mcpu=sandybridge | FileCheck %s --check-prefixes=ZERO
; RUN: llc < %s -mtriple=x86_64-- -mcpu=skylake | FileCheck %s --check-prefixes=ZERO
; RUN: llc < %s -mtriple=x86_64-- -mcpu=k8 | FileCheck %s --check-prefixes=IDIOM
; RUN: llc < %s -mtriple=x86_64-- -mcpu=btver1 | FileCheck %s --check-prefixes=IDIOM
; RUN: llc < %s -mtriple=x86_64-- -mcpu=bdver2 | FileCheck %s --check-prefixes=IDIOM
; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver3 | FileCheck %s --check-prefixes=IDIOM

define i32 @i32_select_0_or_neg1(i32 %x) {
; CHECK-LABEL: i32_select_0_or_neg1:
; CHECK: # %bb.0:
; CHECK-NEXT: negl %edi
; CHECK-NEXT: sbbl %eax, %eax
; CHECK-NEXT: retq
; ZERO-LABEL: i32_select_0_or_neg1:
; ZERO: # %bb.0:
; ZERO-NEXT: xorl %eax, %eax
; ZERO-NEXT: negl %edi
; ZERO-NEXT: sbbl %eax, %eax
; ZERO-NEXT: retq
;
; IDIOM-LABEL: i32_select_0_or_neg1:
; IDIOM: # %bb.0:
; IDIOM-NEXT: negl %edi
; IDIOM-NEXT: sbbl %eax, %eax
; IDIOM-NEXT: retq
%cmp = icmp ne i32 %x, 0
%sel = select i1 %cmp, i32 -1, i32 0
ret i32 %sel
Expand Down
Loading

0 comments on commit 40a50f8

Please sign in to comment.