Skip to content

Commit

Permalink
[X86] Tune bypassing of slow division for Intel CPUs
Browse files Browse the repository at this point in the history
64-bit integer division on Intel CPUs is extremely slow, much slower
than 32-bit division. On the other hand, 8-bit and 16-bit divisions
aren't any faster than 32-bit division. The only important exception
is Atom, where DIV8 is the fastest. Because of that, this patch:
1) Enables bypassing of 64-bit division for Atom, Silvermont and
   all big cores.
2) Modifies 64-bit bypassing to use 32-bit division instead of
   a 16-bit one. This doesn't make the shorter division slower but
   increases the chances of taking it. Moreover, the compiler is much
   more likely to be able to prove at compile time that a value fits
   in 32 bits, in which case no run-time check is required
   (e.g. zext i32 to i64).

Differential Revision: https://reviews.llvm.org/D28196

llvm-svn: 291800
  • Loading branch information
Nikolai Bozhenov committed Jan 12, 2017
1 parent 05b4095 commit 6bdf92c
Show file tree
Hide file tree
Showing 5 changed files with 89 additions and 20 deletions.
5 changes: 3 additions & 2 deletions llvm/lib/Target/X86/X86.td
Expand Up @@ -209,9 +209,9 @@ def FeatureLEAForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true",
def FeatureSlowDivide32 : SubtargetFeature<"idivl-to-divb",
"HasSlowDivide32", "true",
"Use 8-bit divide for positive values less than 256">;
def FeatureSlowDivide64 : SubtargetFeature<"idivq-to-divw",
def FeatureSlowDivide64 : SubtargetFeature<"idivq-to-divl",
"HasSlowDivide64", "true",
"Use 16-bit divide for positive values less than 65536">;
"Use 32-bit divide for positive values less than 2^32">;
def FeaturePadShortFunctions : SubtargetFeature<"pad-short-functions",
"PadShortFunctions", "true",
"Pad short functions">;
Expand Down Expand Up @@ -461,6 +461,7 @@ def SNBFeatures : ProcessorFeatures<[], [
FeatureCMPXCHG16B,
FeaturePOPCNT,
FeatureAES,
FeatureSlowDivide64,
FeaturePCLMUL,
FeatureXSAVE,
FeatureXSAVEOPT,
Expand Down
4 changes: 2 additions & 2 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
Expand Up @@ -97,12 +97,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());

// Bypass expensive divides on Atom when compiling with O2.
// Bypass expensive divides and use cheaper ones.
if (TM.getOptLevel() >= CodeGenOpt::Default) {
if (Subtarget.hasSlowDivide32())
addBypassSlowDiv(32, 8);
if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
addBypassSlowDiv(64, 16);
addBypassSlowDiv(64, 32);
}

if (Subtarget.isTargetKnownWindowsMSVC() ||
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/X86/X86Subtarget.h
Expand Up @@ -216,7 +216,7 @@ class X86Subtarget final : public X86GenSubtargetInfo {
/// 32-bit divisions and should be used when possible.
bool HasSlowDivide32;

/// True if 16-bit divides are significantly faster than
/// True if 32-bit divides are significantly faster than
/// 64-bit divisions and should be used when possible.
bool HasSlowDivide64;

Expand Down
87 changes: 77 additions & 10 deletions llvm/test/CodeGen/X86/atom-bypass-slow-division-64.ll
@@ -1,14 +1,16 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mcpu=atom -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
; RUN: llc < %s -mcpu=sandybridge -mtriple=x86_64-unknown-linux-gnu | FileCheck %s -check-prefix=SNB

; Additional tests for 64-bit divide bypass

define i64 @Test_get_quotient(i64 %a, i64 %b) nounwind {
; CHECK-LABEL: Test_get_quotient:
; CHECK: # BB#0:
; CHECK-NEXT: movq %rdi, %rax
; CHECK-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
; CHECK-NEXT: orq %rsi, %rax
; CHECK-NEXT: testq $-65536, %rax # imm = 0xFFFF0000
; CHECK-NEXT: testq %rcx, %rax
; CHECK-NEXT: je .LBB0_1
; CHECK-NEXT: # BB#2:
; CHECK-NEXT: movq %rdi, %rax
Expand All @@ -18,9 +20,28 @@ define i64 @Test_get_quotient(i64 %a, i64 %b) nounwind {
; CHECK-NEXT: .LBB0_1:
; CHECK-NEXT: xorl %edx, %edx
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: divw %si
; CHECK-NEXT: movzwl %ax, %eax
; CHECK-NEXT: divl %esi
; CHECK-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<def>
; CHECK-NEXT: retq
;
; SNB-LABEL: Test_get_quotient:
; SNB: # BB#0:
; SNB-NEXT: movq %rdi, %rax
; SNB-NEXT: orq %rsi, %rax
; SNB-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
; SNB-NEXT: testq %rcx, %rax
; SNB-NEXT: je .LBB0_1
; SNB-NEXT: # BB#2:
; SNB-NEXT: movq %rdi, %rax
; SNB-NEXT: cqto
; SNB-NEXT: idivq %rsi
; SNB-NEXT: retq
; SNB-NEXT: .LBB0_1:
; SNB-NEXT: xorl %edx, %edx
; SNB-NEXT: movl %edi, %eax
; SNB-NEXT: divl %esi
; SNB-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<def>
; SNB-NEXT: retq
%result = sdiv i64 %a, %b
ret i64 %result
}
Expand All @@ -29,8 +50,9 @@ define i64 @Test_get_remainder(i64 %a, i64 %b) nounwind {
; CHECK-LABEL: Test_get_remainder:
; CHECK: # BB#0:
; CHECK-NEXT: movq %rdi, %rax
; CHECK-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
; CHECK-NEXT: orq %rsi, %rax
; CHECK-NEXT: testq $-65536, %rax # imm = 0xFFFF0000
; CHECK-NEXT: testq %rcx, %rax
; CHECK-NEXT: je .LBB1_1
; CHECK-NEXT: # BB#2:
; CHECK-NEXT: movq %rdi, %rax
Expand All @@ -41,9 +63,31 @@ define i64 @Test_get_remainder(i64 %a, i64 %b) nounwind {
; CHECK-NEXT: .LBB1_1:
; CHECK-NEXT: xorl %edx, %edx
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: divw %si
; CHECK-NEXT: movzwl %dx, %eax
; CHECK-NEXT: divl %esi
; CHECK-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
; CHECK-NEXT: movq %rdx, %rax
; CHECK-NEXT: retq
;
; SNB-LABEL: Test_get_remainder:
; SNB: # BB#0:
; SNB-NEXT: movq %rdi, %rax
; SNB-NEXT: orq %rsi, %rax
; SNB-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
; SNB-NEXT: testq %rcx, %rax
; SNB-NEXT: je .LBB1_1
; SNB-NEXT: # BB#2:
; SNB-NEXT: movq %rdi, %rax
; SNB-NEXT: cqto
; SNB-NEXT: idivq %rsi
; SNB-NEXT: movq %rdx, %rax
; SNB-NEXT: retq
; SNB-NEXT: .LBB1_1:
; SNB-NEXT: xorl %edx, %edx
; SNB-NEXT: movl %edi, %eax
; SNB-NEXT: divl %esi
; SNB-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
; SNB-NEXT: movq %rdx, %rax
; SNB-NEXT: retq
%result = srem i64 %a, %b
ret i64 %result
}
Expand All @@ -52,8 +96,9 @@ define i64 @Test_get_quotient_and_remainder(i64 %a, i64 %b) nounwind {
; CHECK-LABEL: Test_get_quotient_and_remainder:
; CHECK: # BB#0:
; CHECK-NEXT: movq %rdi, %rax
; CHECK-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
; CHECK-NEXT: orq %rsi, %rax
; CHECK-NEXT: testq $-65536, %rax # imm = 0xFFFF0000
; CHECK-NEXT: testq %rcx, %rax
; CHECK-NEXT: je .LBB2_1
; CHECK-NEXT: # BB#2:
; CHECK-NEXT: movq %rdi, %rax
Expand All @@ -64,11 +109,33 @@ define i64 @Test_get_quotient_and_remainder(i64 %a, i64 %b) nounwind {
; CHECK-NEXT: .LBB2_1:
; CHECK-NEXT: xorl %edx, %edx
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: divw %si
; CHECK-NEXT: movzwl %ax, %eax
; CHECK-NEXT: movzwl %dx, %edx
; CHECK-NEXT: divl %esi
; CHECK-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<def>
; CHECK-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
; CHECK-NEXT: addq %rdx, %rax
; CHECK-NEXT: retq
;
; SNB-LABEL: Test_get_quotient_and_remainder:
; SNB: # BB#0:
; SNB-NEXT: movq %rdi, %rax
; SNB-NEXT: orq %rsi, %rax
; SNB-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
; SNB-NEXT: testq %rcx, %rax
; SNB-NEXT: je .LBB2_1
; SNB-NEXT: # BB#2:
; SNB-NEXT: movq %rdi, %rax
; SNB-NEXT: cqto
; SNB-NEXT: idivq %rsi
; SNB-NEXT: addq %rdx, %rax
; SNB-NEXT: retq
; SNB-NEXT: .LBB2_1:
; SNB-NEXT: xorl %edx, %edx
; SNB-NEXT: movl %edi, %eax
; SNB-NEXT: divl %esi
; SNB-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
; SNB-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<def>
; SNB-NEXT: addq %rdx, %rax
; SNB-NEXT: retq
%resultdiv = sdiv i64 %a, %b
%resultrem = srem i64 %a, %b
%result = add i64 %resultdiv, %resultrem
Expand Down
11 changes: 6 additions & 5 deletions llvm/test/CodeGen/X86/slow-div.ll
@@ -1,5 +1,5 @@
; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+idivl-to-divb < %s | FileCheck -check-prefix=DIV32 %s
; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+idivq-to-divw < %s | FileCheck -check-prefix=DIV64 %s
; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+idivq-to-divl < %s | FileCheck -check-prefix=DIV64 %s

define i32 @div32(i32 %a, i32 %b) {
entry:
Expand All @@ -16,11 +16,12 @@ entry:
define i64 @div64(i64 %a, i64 %b) {
entry:
; DIV32-LABEL: div64:
; DIV32-NOT: divw
; DIV32-NOT: divl
; DIV64-LABEL: div64:
; DIV64: orq %{{.*}}, [[REG:%[a-z]+]]
; DIV64: testq $-65536, [[REG]]
; DIV64: divw
; DIV64-DAG: movabsq $-4294967296, [[REGMSK:%[a-z]+]]
; DIV64-DAG: orq %{{.*}}, [[REG:%[a-z]+]]
; DIV64: testq [[REGMSK]], [[REG]]
; DIV64: divl
%div = sdiv i64 %a, %b
ret i64 %div
}
Expand Down

0 comments on commit 6bdf92c

Please sign in to comment.