[X86][SSE] Disable shouldFoldConstantShiftPairToMask for btver1/btver2 targets (PR40758)

As detailed on PR40758, Bobcat/Jaguar can perform vector immediate shifts on the same pipes as vector ANDs, with the same latency - so it doesn't make sense to replace a shl+lshr pair with a shift+and pair, since the additional mask incurs extra constant-pool, load, and register-pressure costs.
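For illustration, here is the identity the generic combine applies, written as a minimal scalar C sketch (not part of the commit; the function names are illustrative and the constants are chosen to match the shl_srl_v4i32 test below). The folded form needs a mask constant which, for vectors, must be materialized from the constant pool:

#include <stdint.h>

/* Original pattern: a right shift followed by a left shift. */
uint32_t shift_pair(uint32_t x) {
    return (x >> 2) << 5;          /* two immediate shifts */
}

/* What the generic DAG combine produces: one shift plus an AND.
 * For a vector, the 0xFFFFFFE0 mask becomes a constant-pool load and
 * occupies an extra register - the cost this commit avoids on Jaguar. */
uint32_t shift_mask(uint32_t x) {
    return (x << 3) & 0xFFFFFFE0u; /* one shift + and with a mask */
}

On Bobcat/Jaguar both versions cost two vector ALU ops of equal latency, so the masked form only adds the constant load and register-pressure overhead.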

Differential Revision: https://reviews.llvm.org/D61068

llvm-svn: 359293
RKSimon committed Apr 26, 2019
1 parent 5e161df commit 5d6ef94
Showing 5 changed files with 40 additions and 8 deletions.
3 changes: 3 additions & 0 deletions llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -6882,6 +6882,8 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
// (and (srl x, (sub c1, c2), MASK)
// Only fold this if the inner shift has no other uses -- if it does, folding
// this will increase the total number of instructions.
// TODO - drop hasOneUse requirement if c1 == c2?
// TODO - support non-uniform vector shift amounts.
if (N1C && N0.getOpcode() == ISD::SRL && N0.hasOneUse() &&
TLI.shouldFoldConstantShiftPairToMask(N, Level)) {
if (ConstantSDNode *N0C1 = isConstOrConstSplat(N0.getOperand(1))) {
@@ -7188,6 +7190,7 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
}

// fold (srl (shl x, c), c) -> (and x, cst2)
// TODO - (srl (shl x, c1), c2).
if (N0.getOpcode() == ISD::SHL && N0.getOperand(1) == N1 &&
isConstantOrConstantVector(N1, /* NoOpaques */ true)) {
SDLoc DL(N);
8 changes: 7 additions & 1 deletion llvm/lib/Target/X86/X86.td
@@ -424,6 +424,11 @@ def FeatureFastHorizontalOps
"Prefer horizontal vector math instructions (haddp, phsub, etc.) over "
"normal vector instructions with shuffles", [FeatureSSE3]>;

def FeatureFastVectorShiftMasks
: SubtargetFeature<
"fast-vector-shift-masks", "HasFastVectorShiftMasks", "true",
"Prefer a left/right vector logical shift pair over a shift+and pair">;

// Merge branches using three-way conditional code.
def FeatureMergeToThreeWayBranch : SubtargetFeature<"merge-to-threeway-branch",
"ThreewayBranchProfitable", "true",
@@ -775,7 +780,8 @@ def ProcessorFeatures {
FeaturePOPCNT,
FeatureSlowSHLD,
FeatureLAHFSAHF,
FeatureFast15ByteNOP];
FeatureFast15ByteNOP,
FeatureFastVectorShiftMasks];
list<SubtargetFeature> BtVer1Features = BtVer1InheritableFeatures;

// Jaguar
13 changes: 12 additions & 1 deletion llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -5013,7 +5013,18 @@ bool X86TargetLowering::hasAndNot(SDValue Y) const {

bool X86TargetLowering::shouldFoldConstantShiftPairToMask(
const SDNode *N, CombineLevel Level) const {
// TODO - some targets prefer immediate vector shifts to shift+mask.
assert((N->getOpcode() == ISD::SHL &&
N->getOperand(0).getOpcode() == ISD::SRL) ||
(N->getOpcode() == ISD::SRL &&
N->getOperand(0).getOpcode() == ISD::SHL) &&
"Expected shift-shift mask");

if (Subtarget.hasFastVectorShiftMasks() && N->getValueType(0).isVector()) {
// Only fold if the shift values are equal - so it folds to AND.
// TODO - we should fold if either is non-uniform but we don't do the
// fold for non-splats yet.
return N->getOperand(1) == N->getOperand(0).getOperand(1);
}
return TargetLoweringBase::shouldFoldConstantShiftPairToMask(N, Level);
}
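To make the equal-amount exception above concrete - a scalar sketch, not part of the commit, with an illustrative function name: when the two shift amounts match, the pair degenerates into a single AND, so the fold creates no extra shift+mask sequence and stays worthwhile even when immediate shifts are as cheap as ANDs.

#include <stdint.h>

/* Equal shift amounts: the shl+lshr pair simply clears the high bits,
 * so the fold produces a single AND - no worse than the two shifts. */
uint32_t equal_amounts(uint32_t x) {
    return (x << 3) >> 3;          /* == x & 0x1FFFFFFFu */
}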

4 changes: 4 additions & 0 deletions llvm/lib/Target/X86/X86Subtarget.h
@@ -393,6 +393,9 @@ class X86Subtarget final : public X86GenSubtargetInfo {
/// Try harder to combine to horizontal vector ops if they are fast.
bool HasFastHorizontalOps = false;

/// Prefer a left/right vector logical shift pair over a shift+and pair.
bool HasFastVectorShiftMasks = false;

/// Use a retpoline thunk rather than indirect calls to block speculative
/// execution.
bool UseRetpolineIndirectCalls = false;
@@ -644,6 +647,7 @@ class X86Subtarget final : public X86GenSubtargetInfo {
bool hasFastSHLDRotate() const { return HasFastSHLDRotate; }
bool hasFastBEXTR() const { return HasFastBEXTR; }
bool hasFastHorizontalOps() const { return HasFastHorizontalOps; }
bool hasFastVectorShiftMasks() const { return HasFastVectorShiftMasks; }
bool hasMacroFusion() const { return HasMacroFusion; }
bool hasBranchFusion() const { return HasBranchFusion; }
bool hasERMSB() const { return HasERMSB; }
20 changes: 14 additions & 6 deletions llvm/test/CodeGen/X86/sse2-vector-shifts.ll
@@ -1,5 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+sse2 | FileCheck %s
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,MASK
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+sse2,+fast-vector-shift-masks | FileCheck %s --check-prefixes=CHECK,SHIFT
; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=btver1 | FileCheck %s --check-prefixes=CHECK,SHIFT

; SSE2 Logical Shift Left

@@ -300,11 +302,17 @@ define <4 x i32> @shl_sra_v4i32(<4 x i32> %x) nounwind {
}

define <4 x i32> @shl_srl_v4i32(<4 x i32> %x) nounwind {
; CHECK-LABEL: shl_srl_v4i32:
; CHECK: # %bb.0:
; CHECK-NEXT: pslld $3, %xmm0
; CHECK-NEXT: pand {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
; MASK-LABEL: shl_srl_v4i32:
; MASK: # %bb.0:
; MASK-NEXT: pslld $3, %xmm0
; MASK-NEXT: pand {{.*}}(%rip), %xmm0
; MASK-NEXT: retq
;
; SHIFT-LABEL: shl_srl_v4i32:
; SHIFT: # %bb.0:
; SHIFT-NEXT: psrld $2, %xmm0
; SHIFT-NEXT: pslld $5, %xmm0
; SHIFT-NEXT: retq
%shl0 = lshr <4 x i32> %x, <i32 2, i32 2, i32 2, i32 2>
%shl1 = shl <4 x i32> %shl0, <i32 5, i32 5, i32 5, i32 5>
ret <4 x i32> %shl1