[x86] avoid false dependency stall on 'sbb' with same source reg

This effectively inverts the transform added with D116804, because the cost of the false dependency created by something like "sbb %eax, %eax" is much greater than the benefit of eliminating a zeroing instruction on (all?) Intel CPUs. Differential Revision: https://reviews.llvm.org/D118843
llvm · Feb 7, 2022 · 40a50f8 · 40a50f8
1 parent 3c33b20
commit 40a50f8
Show file tree

Hide file tree

Showing 20 changed files with 521 additions and 387 deletions.
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
@@ -445,6 +445,10 @@ def TuningLZCNTFalseDeps : SubtargetFeature<"false-deps-lzcnt-tzcnt",
                                      "HasLZCNTFalseDeps", "true",
                                      "LZCNT/TZCNT have a false dependency on dest register">;
 
+def TuningSBBDepBreaking : SubtargetFeature<"sbb-dep-breaking",
+                                     "HasSBBDepBreaking", "true",
+                                     "SBB with same register has no source dependency">;
+
 // On recent X86 (port bound) processors, its preferable to combine to a single shuffle
 // using a variable mask over multiple fixed shuffles.
 def TuningFastVariableCrossLaneShuffle
@@ -1032,6 +1036,7 @@ def ProcessorFeatures {
                                               Feature64Bit];
   list<SubtargetFeature> BarcelonaTuning = [TuningFastScalarShiftMasks,
                                             TuningSlowSHLD,
+                                            TuningSBBDepBreaking,
                                             TuningInsertVZEROUPPER];
 
   // Bobcat
@@ -1053,6 +1058,7 @@ def ProcessorFeatures {
                                          TuningFastScalarShiftMasks,
                                          TuningFastVectorShiftMasks,
                                          TuningSlowSHLD,
+                                         TuningSBBDepBreaking,
                                          TuningInsertVZEROUPPER];
 
   // Jaguar
@@ -1072,6 +1078,7 @@ def ProcessorFeatures {
                                          TuningFastScalarShiftMasks,
                                          TuningFastVectorShiftMasks,
                                          TuningFastMOVBE,
+                                         TuningSBBDepBreaking,
                                          TuningSlowSHLD];
   list<SubtargetFeature> BtVer2Features =
     !listconcat(BtVer1Features, BtVer2AdditionalFeatures);
@@ -1099,6 +1106,7 @@ def ProcessorFeatures {
                                          TuningFast11ByteNOP,
                                          TuningFastScalarShiftMasks,
                                          TuningBranchFusion,
+                                         TuningSBBDepBreaking,
                                          TuningInsertVZEROUPPER];
 
   // PileDriver
@@ -1174,6 +1182,7 @@ def ProcessorFeatures {
                                      TuningFastScalarShiftMasks,
                                      TuningFastMOVBE,
                                      TuningSlowSHLD,
+                                     TuningSBBDepBreaking,
                                      TuningInsertVZEROUPPER];
   list<SubtargetFeature> ZN2AdditionalFeatures = [FeatureCLWB,
                                                   FeatureRDPID,
@@ -1445,15 +1454,15 @@ foreach P = ["k8", "opteron", "athlon64", "athlon-fx"] in {
   def : Proc<P, [FeatureX87, FeatureCMPXCHG8B, FeatureSSE2, Feature3DNowA,
                  FeatureFXSR, FeatureNOPL, Feature64Bit, FeatureCMOV],
                 [TuningFastScalarShiftMasks, TuningSlowSHLD, TuningSlowUAMem16,
-                 TuningInsertVZEROUPPER]>;
+                 TuningSBBDepBreaking, TuningInsertVZEROUPPER]>;
 }
 
 foreach P = ["k8-sse3", "opteron-sse3", "athlon64-sse3"] in {
   def : Proc<P, [FeatureX87, FeatureCMPXCHG8B, FeatureSSE3, Feature3DNowA,
                  FeatureFXSR, FeatureNOPL, FeatureCMPXCHG16B, FeatureCMOV,
                  Feature64Bit],
                 [TuningFastScalarShiftMasks, TuningSlowSHLD, TuningSlowUAMem16,
-                 TuningInsertVZEROUPPER]>;
+                 TuningSBBDepBreaking, TuningInsertVZEROUPPER]>;
 }
 
 foreach P = ["amdfam10", "barcelona"] in {

diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -464,8 +464,13 @@ namespace {
       }
 
       // Copy flags to the EFLAGS register and glue it to next node.
-      SDValue EFLAGS = CurDAG->getCopyToReg(
-          CurDAG->getEntryNode(), dl, X86::EFLAGS, N->getOperand(2), SDValue());
+      unsigned Opcode = N->getOpcode();
+      assert((Opcode == X86ISD::SBB || Opcode == X86ISD::SETCC_CARRY) &&
+             "Unexpected opcode for SBB materialization");
+      unsigned FlagOpIndex = Opcode == X86ISD::SBB ? 2 : 1;
+      SDValue EFLAGS =
+          CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS,
+                               N->getOperand(FlagOpIndex), SDValue());
 
       // Create a 64-bit instruction if the result is 64-bits otherwise use the
       // 32-bit version.
@@ -5801,21 +5806,28 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
     break;
 
   case X86ISD::SETCC_CARRY: {
-    // We have to do this manually because tblgen will put the eflags copy in
-    // the wrong place if we use an extract_subreg in the pattern.
     MVT VT = Node->getSimpleValueType(0);
+    SDValue Result;
+    if (Subtarget->hasSBBDepBreaking()) {
+      // We have to do this manually because tblgen will put the eflags copy in
+      // the wrong place if we use an extract_subreg in the pattern.
+      // Copy flags to the EFLAGS register and glue it to next node.
+      SDValue EFLAGS =
+          CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS,
+                               Node->getOperand(1), SDValue());
 
-    // Copy flags to the EFLAGS register and glue it to next node.
-    SDValue EFLAGS =
-        CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS,
-                             Node->getOperand(1), SDValue());
-
-    // Create a 64-bit instruction if the result is 64-bits otherwise use the
-    // 32-bit version.
-    unsigned Opc = VT == MVT::i64 ? X86::SETB_C64r : X86::SETB_C32r;
-    MVT SetVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
-    SDValue Result = SDValue(
-        CurDAG->getMachineNode(Opc, dl, SetVT, EFLAGS, EFLAGS.getValue(1)), 0);
+      // Create a 64-bit instruction if the result is 64-bits otherwise use the
+      // 32-bit version.
+      unsigned Opc = VT == MVT::i64 ? X86::SETB_C64r : X86::SETB_C32r;
+      MVT SetVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
+      Result = SDValue(
+          CurDAG->getMachineNode(Opc, dl, SetVT, EFLAGS, EFLAGS.getValue(1)),
+          0);
+    } else {
+      // The target does not recognize sbb with the same reg operand as a
+      // no-source idiom, so we explicitly zero the input values.
+      Result = getSBBZero(Node);
+    }
 
     // For less than 32-bits we need to extract from the 32-bit node.
     if (VT == MVT::i8 || VT == MVT::i16) {

diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h
@@ -246,6 +246,10 @@ class X86Subtarget final : public X86GenSubtargetInfo {
   /// True if LZCNT/TZCNT instructions have a false dependency on the destination register.
   bool HasLZCNTFalseDeps = false;
 
+  /// True if an SBB instruction with same source register is recognized as
+  /// having no dependency on that register.
+  bool HasSBBDepBreaking = false;
+
   /// True if its preferable to combine to a single cross-lane shuffle
   /// using a variable mask over multiple fixed shuffles.
   bool HasFastVariableCrossLaneShuffle = false;
@@ -719,6 +723,7 @@ class X86Subtarget final : public X86GenSubtargetInfo {
   bool useLeaForSP() const { return UseLeaForSP; }
   bool hasPOPCNTFalseDeps() const { return HasPOPCNTFalseDeps; }
   bool hasLZCNTFalseDeps() const { return HasLZCNTFalseDeps; }
+  bool hasSBBDepBreaking() const { return HasSBBDepBreaking; }
   bool hasFastVariableCrossLaneShuffle() const {
     return HasFastVariableCrossLaneShuffle;
   }

diff --git a/llvm/test/CodeGen/X86/combine-movmsk-avx.ll b/llvm/test/CodeGen/X86/combine-movmsk-avx.ll
@@ -139,8 +139,9 @@ define i32 @movmskps_concat_v4f32(<4 x float> %a0, <4 x float> %a1)  {
 ; CHECK-LABEL: movmskps_concat_v4f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vorps %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vmovmskps %xmm0, %eax
-; CHECK-NEXT:    negl %eax
+; CHECK-NEXT:    vmovmskps %xmm0, %ecx
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    negl %ecx
 ; CHECK-NEXT:    sbbl %eax, %eax
 ; CHECK-NEXT:    retq
   %1 = shufflevector <4 x float> %a0, <4 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -153,9 +154,10 @@ define i32 @movmskps_concat_v4f32(<4 x float> %a0, <4 x float> %a1)  {
 define i32 @movmskps_demanded_concat_v4f32(<4 x float> %a0, <4 x float> %a1)  {
 ; CHECK-LABEL: movmskps_demanded_concat_v4f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovmskps %xmm0, %eax
-; CHECK-NEXT:    andl $3, %eax
-; CHECK-NEXT:    negl %eax
+; CHECK-NEXT:    vmovmskps %xmm0, %ecx
+; CHECK-NEXT:    andl $3, %ecx
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    negl %ecx
 ; CHECK-NEXT:    sbbl %eax, %eax
 ; CHECK-NEXT:    retq
   %1 = shufflevector <4 x float> %a0, <4 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>

diff --git a/llvm/test/CodeGen/X86/copy-eflags.ll b/llvm/test/CodeGen/X86/copy-eflags.ll
@@ -293,6 +293,7 @@ bb1:
 define dso_local void @PR37431(i32* %arg1, i8* %arg2, i8* %arg3, i32 %arg4, i64 %arg5) nounwind {
 ; X32-LABEL: PR37431:
 ; X32:       # %bb.0: # %entry
+; X32-NEXT:    pushl %ebp
 ; X32-NEXT:    pushl %ebx
 ; X32-NEXT:    pushl %edi
 ; X32-NEXT:    pushl %esi
@@ -302,10 +303,11 @@ define dso_local void @PR37431(i32* %arg1, i8* %arg2, i8* %arg3, i32 %arg4, i64
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X32-NEXT:    movl (%edi), %edi
-; X32-NEXT:    movl %edi, %ebx
-; X32-NEXT:    sarl $31, %ebx
+; X32-NEXT:    movl %edi, %ebp
+; X32-NEXT:    sarl $31, %ebp
+; X32-NEXT:    xorl %ebx, %ebx
 ; X32-NEXT:    cmpl %edi, {{[0-9]+}}(%esp)
-; X32-NEXT:    sbbl %ebx, %esi
+; X32-NEXT:    sbbl %ebp, %esi
 ; X32-NEXT:    sbbl %ebx, %ebx
 ; X32-NEXT:    movb %bl, (%edx)
 ; X32-NEXT:    cltd
@@ -314,13 +316,15 @@ define dso_local void @PR37431(i32* %arg1, i8* %arg2, i8* %arg3, i32 %arg4, i64
 ; X32-NEXT:    popl %esi
 ; X32-NEXT:    popl %edi
 ; X32-NEXT:    popl %ebx
+; X32-NEXT:    popl %ebp
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: PR37431:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    movl %ecx, %eax
 ; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    movslq (%rdi), %rdx
+; X64-NEXT:    xorl %edi, %edi
 ; X64-NEXT:    cmpq %rdx, %r8
 ; X64-NEXT:    sbbl %edi, %edi
 ; X64-NEXT:    movb %dil, (%rsi)

diff --git a/llvm/test/CodeGen/X86/jump_sign.ll b/llvm/test/CodeGen/X86/jump_sign.ll
@@ -310,6 +310,7 @@ define i32 @func_q(i32 %a0, i32 %a1, i32 %a2) {
 ; CHECK-LABEL: func_q:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    xorl %ecx, %ecx
 ; CHECK-NEXT:    subl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    sbbl %ecx, %ecx
 ; CHECK-NEXT:    negl %eax

diff --git a/llvm/test/CodeGen/X86/machine-cse.ll b/llvm/test/CodeGen/X86/machine-cse.ll
@@ -112,6 +112,8 @@ define i32 @cross_mbb_phys_cse(i32 %a, i32 %b) nounwind ssp {
 ; CHECK-NEXT:    cmpl %esi, %edi
 ; CHECK-NEXT:    ja .LBB2_2
 ; CHECK-NEXT:  # %bb.1: # %if.end
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    cmpl %esi, %edi
 ; CHECK-NEXT:    sbbl %eax, %eax
 ; CHECK-NEXT:  .LBB2_2: # %return
 ; CHECK-NEXT:    retq

diff --git a/llvm/test/CodeGen/X86/pr32588.ll b/llvm/test/CodeGen/X86/pr32588.ll
@@ -8,6 +8,7 @@
 define void @fn1() {
 ; CHECK-LABEL: fn1:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    cmpl $1, c(%rip)
 ; CHECK-NEXT:    sbbl %eax, %eax
 ; CHECK-NEXT:    andl $1, %eax

diff --git a/llvm/test/CodeGen/X86/pr35972.ll b/llvm/test/CodeGen/X86/pr35972.ll
@@ -5,6 +5,7 @@ define void @test3(i32 %c, <64 x i1>* %ptr) {
 ; CHECK-LABEL: test3:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    xorl %ecx, %ecx
 ; CHECK-NEXT:    cmpl $1, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    sbbl %ecx, %ecx
 ; CHECK-NEXT:    kmovd %ecx, %k0

diff --git a/llvm/test/CodeGen/X86/sbb-false-dep.ll b/llvm/test/CodeGen/X86/sbb-false-dep.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s --check-prefixes=CHECK
+; RUN: llc < %s -mtriple=x86_64--                          | FileCheck %s --check-prefixes=CHECK
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+sbb-dep-breaking | FileCheck %s --check-prefixes=IDIOM
 
 %struct.y_s = type { i64*, i64* }
 
@@ -24,22 +25,24 @@ define i32 @mallocbench_gs(i32* noundef %0, %struct.y_s* noundef %1, i32 noundef
 ; CHECK-NEXT:    callq foo1@PLT
 ; CHECK-NEXT:    movq 8(%rbx), %rax
 ; CHECK-NEXT:    movq (%rax), %rdx
+; CHECK-NEXT:    xorl %ebp, %ebp
 ; CHECK-NEXT:    movl %r13d, %ecx
 ; CHECK-NEXT:    negl %ecx
-; CHECK-NEXT:    sbbq %rbp, %rbp
-; CHECK-NEXT:    orq %rdx, %rbp
-; CHECK-NEXT:    cmpl $1, %r13d
+; CHECK-NEXT:    movl $0, %eax
 ; CHECK-NEXT:    sbbq %rax, %rax
 ; CHECK-NEXT:    orq %rdx, %rax
+; CHECK-NEXT:    cmpl $1, %r13d
+; CHECK-NEXT:    sbbq %rbp, %rbp
+; CHECK-NEXT:    orq %rdx, %rbp
 ; CHECK-NEXT:    subq $8, %rsp
 ; CHECK-NEXT:    movq %r12, %rdi
 ; CHECK-NEXT:    movl %r15d, %esi
 ; CHECK-NEXT:    movl %r14d, %edx
 ; CHECK-NEXT:    xorl %ecx, %ecx
 ; CHECK-NEXT:    xorl %r8d, %r8d
 ; CHECK-NEXT:    xorl %r9d, %r9d
-; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    pushq %rbx
 ; CHECK-NEXT:    callq foo2@PLT
 ; CHECK-NEXT:    addq $40, %rsp
@@ -50,6 +53,53 @@ define i32 @mallocbench_gs(i32* noundef %0, %struct.y_s* noundef %1, i32 noundef
 ; CHECK-NEXT:    popq %r15
 ; CHECK-NEXT:    popq %rbp
 ; CHECK-NEXT:    retq
+;
+; IDIOM-LABEL: mallocbench_gs:
+; IDIOM:       # %bb.0:
+; IDIOM-NEXT:    pushq %rbp
+; IDIOM-NEXT:    pushq %r15
+; IDIOM-NEXT:    pushq %r14
+; IDIOM-NEXT:    pushq %r13
+; IDIOM-NEXT:    pushq %r12
+; IDIOM-NEXT:    pushq %rbx
+; IDIOM-NEXT:    pushq %rax
+; IDIOM-NEXT:    movl %r8d, %r13d
+; IDIOM-NEXT:    movl %ecx, %r14d
+; IDIOM-NEXT:    movl %edx, %r15d
+; IDIOM-NEXT:    movq %rsi, %rbx
+; IDIOM-NEXT:    movq %rdi, %r12
+; IDIOM-NEXT:    movq (%rsi), %rdi
+; IDIOM-NEXT:    movq 8(%rsi), %rsi
+; IDIOM-NEXT:    movq %rbx, %rdx
+; IDIOM-NEXT:    callq foo1@PLT
+; IDIOM-NEXT:    movq 8(%rbx), %rax
+; IDIOM-NEXT:    movq (%rax), %rdx
+; IDIOM-NEXT:    movl %r13d, %ecx
+; IDIOM-NEXT:    negl %ecx
+; IDIOM-NEXT:    sbbq %rbp, %rbp
+; IDIOM-NEXT:    orq %rdx, %rbp
+; IDIOM-NEXT:    cmpl $1, %r13d
+; IDIOM-NEXT:    sbbq %rax, %rax
+; IDIOM-NEXT:    orq %rdx, %rax
+; IDIOM-NEXT:    subq $8, %rsp
+; IDIOM-NEXT:    movq %r12, %rdi
+; IDIOM-NEXT:    movl %r15d, %esi
+; IDIOM-NEXT:    movl %r14d, %edx
+; IDIOM-NEXT:    xorl %ecx, %ecx
+; IDIOM-NEXT:    xorl %r8d, %r8d
+; IDIOM-NEXT:    xorl %r9d, %r9d
+; IDIOM-NEXT:    pushq %rax
+; IDIOM-NEXT:    pushq %rbp
+; IDIOM-NEXT:    pushq %rbx
+; IDIOM-NEXT:    callq foo2@PLT
+; IDIOM-NEXT:    addq $40, %rsp
+; IDIOM-NEXT:    popq %rbx
+; IDIOM-NEXT:    popq %r12
+; IDIOM-NEXT:    popq %r13
+; IDIOM-NEXT:    popq %r14
+; IDIOM-NEXT:    popq %r15
+; IDIOM-NEXT:    popq %rbp
+; IDIOM-NEXT:    retq
   %6 = getelementptr inbounds %struct.y_s, %struct.y_s* %1, i64 0, i32 0
   %7 = load i64*, i64** %6, align 8
   %8 = getelementptr inbounds %struct.y_s, %struct.y_s* %1, i64 0, i32 1

diff --git a/llvm/test/CodeGen/X86/sbb-zero-idiom.ll b/llvm/test/CodeGen/X86/sbb-zero-idiom.ll
@@ -1,18 +1,33 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64--                   | FileCheck %s --check-prefixes=CHECK
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=sandybridge | FileCheck %s --check-prefixes=CHECK
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=skylake     | FileCheck %s --check-prefixes=CHECK
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=k8          | FileCheck %s --check-prefixes=CHECK
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=btver1      | FileCheck %s --check-prefixes=CHECK
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=bdver2      | FileCheck %s --check-prefixes=CHECK
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver3      | FileCheck %s --check-prefixes=CHECK
+
+; Check the attribute.
+
+; RUN: llc < %s -mtriple=x86_64-- -mattr=-sbb-dep-breaking | FileCheck %s --check-prefixes=ZERO
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+sbb-dep-breaking | FileCheck %s --check-prefixes=IDIOM
+
+; And check that CPUs have included the attribute as expected.
+
+; RUN: llc < %s -mtriple=x86_64--                   | FileCheck %s --check-prefixes=ZERO
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=sandybridge | FileCheck %s --check-prefixes=ZERO
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=skylake     | FileCheck %s --check-prefixes=ZERO
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=k8          | FileCheck %s --check-prefixes=IDIOM
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=btver1      | FileCheck %s --check-prefixes=IDIOM
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=bdver2      | FileCheck %s --check-prefixes=IDIOM
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver3      | FileCheck %s --check-prefixes=IDIOM
 
 define i32 @i32_select_0_or_neg1(i32 %x) {
-; CHECK-LABEL: i32_select_0_or_neg1:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    negl %edi
-; CHECK-NEXT:    sbbl %eax, %eax
-; CHECK-NEXT:    retq
+; ZERO-LABEL: i32_select_0_or_neg1:
+; ZERO:       # %bb.0:
+; ZERO-NEXT:    xorl %eax, %eax
+; ZERO-NEXT:    negl %edi
+; ZERO-NEXT:    sbbl %eax, %eax
+; ZERO-NEXT:    retq
+;
+; IDIOM-LABEL: i32_select_0_or_neg1:
+; IDIOM:       # %bb.0:
+; IDIOM-NEXT:    negl %edi
+; IDIOM-NEXT:    sbbl %eax, %eax
+; IDIOM-NEXT:    retq
   %cmp = icmp ne i32 %x, 0
   %sel = select i1 %cmp, i32 -1, i32 0
   ret i32 %sel